diff options
Diffstat (limited to 'sysdeps/ia64')
163 files changed, 80389 insertions, 97 deletions
diff --git a/sysdeps/ia64/fpu/Dist b/sysdeps/ia64/fpu/Dist new file mode 100644 index 0000000000..ae51e76a8b --- /dev/null +++ b/sysdeps/ia64/fpu/Dist @@ -0,0 +1,6 @@ +libm_atan2_reg.S +libm_error.c +libm_reduce.S +libm_support.h +s_matherrf +s_matherrl diff --git a/sysdeps/ia64/fpu/Makefile b/sysdeps/ia64/fpu/Makefile new file mode 100644 index 0000000000..e5237ffa84 --- /dev/null +++ b/sysdeps/ia64/fpu/Makefile @@ -0,0 +1,7 @@ +ifeq ($(subdir),math) +libm-sysdep_routines += libm_atan2_reg s_matherrf s_matherrl libm_reduce \ + libm_tan + +routines += libm_frexp4 libm_frexp4f libm_frexp4l libm_error +CPPFLAGS += -DSIZE_INT_32 +endif diff --git a/sysdeps/ia64/fpu/Versions b/sysdeps/ia64/fpu/Versions new file mode 100644 index 0000000000..6e46589cee --- /dev/null +++ b/sysdeps/ia64/fpu/Versions @@ -0,0 +1,10 @@ +libc { + GLIBC_2.2.3 { + __libm_frexp_4; __libm_frexp_4f; __libm_frexp_4l; __libm_error_support; + } +} +libm { + GLIBC_2.2.3 { + matherrf; matherrl; + } +} diff --git a/sysdeps/ia64/fpu/bits/mathdef.h b/sysdeps/ia64/fpu/bits/mathdef.h index 90c1e89f87..ad3b1686f0 100644 --- a/sysdeps/ia64/fpu/bits/mathdef.h +++ b/sysdeps/ia64/fpu/bits/mathdef.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2000 Free Software Foundation, Inc. +/* Copyright (C) 2000, 2001 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,19 +23,18 @@ #if defined __USE_ISOC99 && defined _MATH_H && !defined _MATH_H_MATHDEF # define _MATH_H_MATHDEF 1 -/* The ix87 FPUs evaluate all values in the 80 bit floating-point format - which is also available for the user as `long double'. Therefore we - define: */ -typedef long double float_t; /* `float' expressions are evaluated as - `long double'. */ -typedef long double double_t; /* `double' expressions are evaluated as - `long double'. */ +/* The IA-64 architecture computes values with the precision of the + used type. 
*/ +typedef float float_t; /* `float' expressions are evaluated as + `float'. */ +typedef double double_t; /* `double' expressions are evaluated as + `double'. */ /* Define `INFINITY' as value of type `float'. */ # define INFINITY HUGE_VALF /* The values returned by `ilogb' for 0 and NaN respectively. */ # define FP_ILOGB0 (-2147483647 - 1) -# define FP_ILOGBNAN (-2147483647 - 1) +# define FP_ILOGBNAN 2147483647 #endif /* ISO C99 */ diff --git a/sysdeps/ia64/fpu/e_acos.S b/sysdeps/ia64/fpu/e_acos.S new file mode 100644 index 0000000000..1d8085c989 --- /dev/null +++ b/sysdeps/ia64/fpu/e_acos.S @@ -0,0 +1,904 @@ +.file "acos.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+ +// History +//============================================================== +// 2/02/00 Initial version +// 8/17/00 New and much faster algorithm. +// 8/30/00 Avoided bank conflicts on loads, shortened |x|=1 and x=0 paths, +// fixed mfb split issue stalls. + +// Description +//========================================= +// The acos function computes the principal value of the arc cosine of x. +// A domain error occurs for arguments not in the range [-1,+1]. + +// The acos function returns the arc cosine in the range [0, +pi] radians. +// acos(1) returns +0, acos(-1) returns pi, acos(0) returns pi/2. +// acos(x) returns a NaN and raises the invalid exception for |x| >1 + +// The acos function is just like asin except that pi/2 is added at the end. + +// +// Assembly macros +//========================================= + +#include "libm_support.h" + +// predicate registers +//acos_pred_LEsqrt2by2 = p7 +//acos_pred_GTsqrt2by2 = p8 + +// integer registers +ASIN_Addr1 = r33 +ASIN_Addr2 = r34 +ASIN_FFFE = r35 + +GR_SAVE_B0 = r36 +GR_SAVE_PFS = r37 +GR_SAVE_GP = r38 + +GR_Parameter_X = r39 +GR_Parameter_Y = r40 +GR_Parameter_RESULT = r41 +GR_Parameter_Tag = r42 + +// floating point registers +acos_coeff_P1 = f32 +acos_coeff_P2 = f33 +acos_coeff_P3 = f34 +acos_coeff_P4 = f35 + +acos_coeff_P5 = f36 +acos_coeff_P6 = f37 +acos_coeff_P7 = f38 +acos_coeff_P8 = f39 +acos_coeff_P9 = f40 + +acos_coeff_P10 = f41 +acos_coeff_P11 = f42 +acos_coeff_P12 = f43 +acos_coeff_P13 = f44 +acos_coeff_P14 = f45 + +acos_coeff_P15 = f46 +acos_coeff_P16 = f47 +acos_coeff_P17 = f48 +acos_coeff_P18 = f49 +acos_coeff_P19 = f50 + +acos_coeff_P20 = f51 +acos_coeff_P21 = f52 +acos_const_sqrt2by2 = f53 +acos_const_piby2 = f54 +acos_abs_x = f55 + +acos_tx = f56 +acos_tx2 = f57 +acos_tx3 = f58 +acos_tx4 = f59 +acos_tx8 = f60 + +acos_tx11 = f61 +acos_1poly_p8 = f62 +acos_1poly_p19 = f63 +acos_1poly_p4 = f64 +acos_1poly_p15 = f65 + +acos_1poly_p6 = f66 +acos_1poly_p17 = f67 +acos_1poly_p0 = f68 
+acos_1poly_p11 = f69 +acos_1poly_p2 = f70 + +acos_1poly_p13 = f71 +acos_series_tx = f72 +acos_t = f73 +acos_t2 = f74 +acos_t3 = f75 + +acos_t4 = f76 +acos_t8 = f77 +acos_t11 = f78 +acos_poly_p8 = f79 +acos_poly_p19 = f80 + +acos_poly_p4 = f81 +acos_poly_p15 = f82 +acos_poly_p6 = f83 +acos_poly_p17 = f84 +acos_poly_p0 = f85 + +acos_poly_p11 = f86 +acos_poly_p2 = f87 +acos_poly_p13 = f88 +acos_series_t = f89 +acos_1by2 = f90 + +acos_3by2 = f91 +acos_5by2 = f92 +acos_11by4 = f93 +acos_35by8 = f94 +acos_63by8 = f95 + +acos_231by16 = f96 +acos_y0 = f97 +acos_H0 = f98 +acos_S0 = f99 +acos_d = f100 + +acos_l1 = f101 +acos_d2 = f102 +acos_T0 = f103 +acos_d1 = f104 +acos_e0 = f105 + +acos_l2 = f106 +acos_d3 = f107 +acos_T3 = f108 +acos_S1 = f109 +acos_e1 = f110 + +acos_z = f111 +answer2 = f112 +acos_sgn_x = f113 +acos_429by16 = f114 +acos_18by4 = f115 + +acos_3by4 = f116 +acos_l3 = f117 +acos_T6 = f118 +acos_const_add = f119 + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +acos_coeff_1_table: +ASM_TYPE_DIRECTIVE(acos_coeff_1_table,@object) +data8 0xE4E7E0A423A21249 , 0x00003FF8 //P7 +data8 0xC2F7EE0200FCE2A5 , 0x0000C003 //P18 +data8 0xB745D7F6C65C20E0 , 0x00003FF9 //P5 +data8 0xF75E381A323D4D94 , 0x0000C002 //P16 +data8 0x8959C2629C1024C0 , 0x0000C002 //P20 +data8 0xAFF68E7D241292C5 , 0x00003FF8 //P9 +data8 0xB6DB6DB7260AC30D , 0x00003FFA //P3 +data8 0xD0417CE2B41CB7BF , 0x0000C000 //P14 +data8 0x81D570FEA724E3E4 , 0x0000BFFD //P12 +data8 0xAAAAAAAAAAAAC277 , 0x00003FFC //P1 +data8 0xF534912FF3E7B76F , 0x00003FFF //P21 +data8 0xc90fdaa22168c235 , 0x00003fff // pi/2 +data8 0x0000000000000000 , 0x00000000 // pad to avoid bank conflicts +ASM_SIZE_DIRECTIVE(acos_coeff_1_table) + + +acos_coeff_2_table: +ASM_TYPE_DIRECTIVE(acos_coeff_2_table,@object) +data8 0x8E26AF5F29B39A2A , 0x00003FF9 //P6 +data8 0xB4F118A4B1015470 , 0x00004003 //P17 +data8 0xF8E38E10C25990E0 , 0x00003FF9 //P4 
+data8 0x80F50489AEF1CAC6 , 0x00004002 //P15 +data8 0x92728015172CFE1C , 0x00004003 //P19 +data8 0xBBC3D831D4595971 , 0x00003FF8 //P8 +data8 0x999999999952A5C3 , 0x00003FFB //P2 +data8 0x855576BE6F0975EC , 0x00003FFF //P13 +data8 0xF12420E778077D89 , 0x00003FFA //P11 +data8 0xB6590FF4D23DE003 , 0x00003FF3 //P10 +data8 0xb504f333f9de6484 , 0x00003ffe // sqrt(2)/2 +ASM_SIZE_DIRECTIVE(acos_coeff_2_table) + + +.align 32 +.global acos +ASM_TYPE_DIRECTIVE(acos,@function) + +.section .text +.proc acos +.align 32 + + +acos: + +{ .mfi + alloc r32 = ar.pfs,1,6,4,0 + fma.s1 acos_tx = f8,f8,f0 + addl ASIN_Addr2 = @ltoff(acos_coeff_2_table),gp +} +{ .mfi + mov ASIN_FFFE = 0xFFFE + fnma.s1 acos_t = f8,f8,f1 + addl ASIN_Addr1 = @ltoff(acos_coeff_1_table),gp +} +;; + + +{ .mfi + setf.exp acos_1by2 = ASIN_FFFE + fmerge.s acos_abs_x = f1,f8 + nop.i 999 ;; +} + + +{ .mmf + ld8 ASIN_Addr1 = [ASIN_Addr1] + ld8 ASIN_Addr2 = [ASIN_Addr2] + fmerge.s acos_sgn_x = f8,f1 +} +;; + + +{ .mfi + nop.m 999 + fcmp.lt.s1 p11,p12 = f8, f0 + nop.i 999 ;; +} + + +{ .mfi + ldfe acos_coeff_P7 = [ASIN_Addr1],16 + fma.s1 acos_tx2 = acos_tx,acos_tx,f0 + nop.i 999 +} +{ .mfi + ldfe acos_coeff_P6 = [ASIN_Addr2],16 + fma.s1 acos_t2 = acos_t,acos_t,f0 + nop.i 999;; +} + + +{ .mmf + ldfe acos_coeff_P18 = [ASIN_Addr1],16 + ldfe acos_coeff_P17 = [ASIN_Addr2],16 + fclass.m.unc p8,p0 = f8, 0xc3 //@qnan |@snan +} +;; + + +{ .mmf + ldfe acos_coeff_P5 = [ASIN_Addr1],16 + ldfe acos_coeff_P4 = [ASIN_Addr2],16 + frsqrta.s1 acos_y0,p0 = acos_t +} +;; + + +{ .mfi + ldfe acos_coeff_P16 = [ASIN_Addr1],16 + fcmp.gt.s1 p9,p0 = acos_abs_x,f1 + nop.i 999 +} +{ .mfb + ldfe acos_coeff_P15 = [ASIN_Addr2],16 +(p8) fma.d f8 = f8,f1,f0 +(p8) br.ret.spnt b0 +} +;; + + +{ .mmf + ldfe acos_coeff_P20 = [ASIN_Addr1],16 + ldfe acos_coeff_P19 = [ASIN_Addr2],16 + fclass.m.unc p10,p0 = f8, 0x07 //@zero +} +;; + + +{ .mfi + ldfe acos_coeff_P9 = [ASIN_Addr1],16 + fma.s1 acos_t4 = acos_t2,acos_t2,f0 +(p9) mov GR_Parameter_Tag = 58 +} +{ .mfi + 
ldfe acos_coeff_P8 = [ASIN_Addr2],16 + fma.s1 acos_3by2 = acos_1by2,f1,f1 + nop.i 999;; +} + + +{ .mfi + ldfe acos_coeff_P2 = [ASIN_Addr2],16 + fma.s1 acos_tx4 = acos_tx2,acos_tx2,f0 + nop.i 999 +} +{ .mfb + ldfe acos_coeff_P3 = [ASIN_Addr1],16 + fma.s1 acos_t3 = acos_t,acos_t2,f0 +(p9) br.cond.spnt __libm_error_region +} +;; + + +{ .mfi + ldfe acos_coeff_P13 = [ASIN_Addr2],16 + fma.s1 acos_H0 = acos_y0,acos_1by2,f0 + nop.i 999 +} +{ .mfi + ldfe acos_coeff_P14 = [ASIN_Addr1],16 + fma.s1 acos_S0 = acos_y0,acos_t,f0 + nop.i 999;; +} + + +{ .mfi + ldfe acos_coeff_P11 = [ASIN_Addr2],16 + fcmp.eq.s1 p6,p0 = acos_abs_x, f1 + nop.i 999 +} +{ .mfi + ldfe acos_coeff_P12 = [ASIN_Addr1],16 + fma.s1 acos_tx3 = acos_tx,acos_tx2,f0 + nop.i 999 +} +;; + + +{ .mfi + ldfe acos_coeff_P10 = [ASIN_Addr2],16 + fma.s1 acos_1poly_p6 = acos_tx,acos_coeff_P7,acos_coeff_P6 + nop.i 999 +} +{ .mfi + ldfe acos_coeff_P1 = [ASIN_Addr1],16 + fma.s1 acos_poly_p6 = acos_t,acos_coeff_P7,acos_coeff_P6 + nop.i 999;; +} + + +{ .mfi + ldfe acos_const_sqrt2by2 = [ASIN_Addr2],16 + fma.s1 acos_5by2 = acos_3by2,f1,f1 + nop.i 999 +} +{ .mfi + ldfe acos_coeff_P21 = [ASIN_Addr1],16 + fma.s1 acos_11by4 = acos_3by2,acos_3by2,acos_1by2 + nop.i 999;; +} + + +{ .mfi + ldfe acos_const_piby2 = [ASIN_Addr1],16 + fma.s1 acos_poly_p17 = acos_t,acos_coeff_P18,acos_coeff_P17 + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 acos_3by4 = acos_3by2,acos_1by2,f0 +(p10) br.cond.spnt L(ACOS_ZERO) // Branch to short path if x=0 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 acos_poly_p15 = acos_t,acos_coeff_P16,acos_coeff_P15 + nop.i 999 +} +{ .mfb + nop.m 999 + fnma.s1 acos_d = acos_S0,acos_H0,acos_1by2 +(p6) br.cond.spnt L(ACOS_ABS_ONE) // Branch to short path if |x|=1 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 acos_poly_p19 = acos_t,acos_coeff_P20,acos_coeff_P19 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_poly_p4 = acos_t,acos_coeff_P5,acos_coeff_P4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p17 = 
acos_tx,acos_coeff_P18,acos_coeff_P17 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_poly_p8 = acos_t,acos_coeff_P9,acos_coeff_P8 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fms.s1 acos_35by8 = acos_5by2,acos_11by4,acos_5by2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_63by8 = acos_5by2,acos_11by4,f1 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_poly_p13 = acos_t,acos_coeff_P14,acos_coeff_P13 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_18by4 = acos_3by2,acos_5by2,acos_3by4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_l1 = acos_5by2,acos_d,acos_3by2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_d2 = acos_d,acos_d,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_poly_p15 = acos_t2,acos_poly_p17,acos_poly_p15 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_T0 = acos_d,acos_S0,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_poly_p19 = acos_t2,acos_coeff_P21,acos_poly_p19 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_poly_p4 = acos_t2,acos_poly_p6,acos_poly_p4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_d1 = acos_35by8,acos_d,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_231by16 = acos_3by2,acos_35by8,acos_63by8 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_poly_p2 = acos_t,acos_coeff_P3,acos_coeff_P2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_poly_p8 = acos_t2,acos_coeff_P10,acos_poly_p8 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_poly_p11 = acos_t,acos_coeff_P12,acos_coeff_P11 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_e0 = acos_d2,acos_l1,acos_d + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p15 = acos_tx,acos_coeff_P16,acos_coeff_P15 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_poly_p0 = acos_t,acos_coeff_P1,f1 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p19 = acos_tx,acos_coeff_P20,acos_coeff_P19 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p4 = acos_tx,acos_coeff_P5,acos_coeff_P4 + nop.i 999;; +} + + +{ .mfi + nop.m 
999 + fma.s1 acos_1poly_p8 = acos_tx,acos_coeff_P9,acos_coeff_P8 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_l2 = acos_231by16,acos_d,acos_63by8 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_d3 = acos_d2,acos_d,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_T3 = acos_d2,acos_T0,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_429by16 = acos_18by4,acos_11by4,acos_231by16 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_S1 = acos_e0,acos_S0,acos_S0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_poly_p4 = acos_t4,acos_poly_p8,acos_poly_p4 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_poly_p15 = acos_t4,acos_poly_p19,acos_poly_p15 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_poly_p0 = acos_t2,acos_poly_p2,acos_poly_p0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_poly_p11 = acos_t2,acos_poly_p13,acos_poly_p11 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_t8 = acos_t4,acos_t4,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_e1 = acos_d2,acos_l2,acos_d1 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p4 = acos_tx2,acos_1poly_p6,acos_1poly_p4 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p15 = acos_tx2,acos_1poly_p17,acos_1poly_p15 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p8 = acos_tx2,acos_coeff_P10,acos_1poly_p8 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p19 = acos_tx2,acos_coeff_P21,acos_1poly_p19 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p2 = acos_tx,acos_coeff_P3,acos_coeff_P2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p13 = acos_tx,acos_coeff_P14,acos_coeff_P13 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p0 = acos_tx,acos_coeff_P1,f1 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p11 = acos_tx,acos_coeff_P12,acos_coeff_P11 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_l3 = acos_429by16,acos_d,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_z = acos_e1,acos_T3,acos_S1 + nop.i 999;; 
+} + + +{ .mfi + nop.m 999 + fma.s1 acos_poly_p11 = acos_t4,acos_poly_p15,acos_poly_p11 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_T6 = acos_T3,acos_d3,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_t11 = acos_t8,acos_t3,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_poly_p0 = acos_t4,acos_poly_p4,acos_poly_p0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p4 = acos_tx4,acos_1poly_p8,acos_1poly_p4 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p15 = acos_tx4,acos_1poly_p19,acos_1poly_p15 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p0 = acos_tx2,acos_1poly_p2,acos_1poly_p0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p11 = acos_tx2,acos_1poly_p13,acos_1poly_p11 + nop.i 999;; +} + + +{ .mfi + nop.m 999 +// fcmp.le.s1 acos_pred_LEsqrt2by2,acos_pred_GTsqrt2by2 = acos_abs_x,acos_const_sqrt2by2 + fcmp.le.s1 p7,p8 = acos_abs_x,acos_const_sqrt2by2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_tx8 = acos_tx4,acos_tx4,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_z = acos_l3,acos_T6,acos_z + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 acos_series_t = acos_t11,acos_poly_p11,acos_poly_p0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fma.s1 acos_const_add = acos_const_piby2, f1, acos_const_piby2 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p12) fma.s1 acos_const_add = f1,f0,f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p0 = acos_tx4,acos_1poly_p4,acos_1poly_p0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acos_1poly_p11 = acos_tx4,acos_1poly_p15,acos_1poly_p11 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acos_tx11 = acos_tx8,acos_tx3,f0 + nop.i 999;; +} + +{ .mfi + nop.m 999 +//(acos_pred_GTsqrt2by2) fnma.s1 answer2 = acos_z,acos_series_t,acos_const_piby2 +(p8) fnma.s1 answer2 = acos_z,acos_series_t,f0 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 acos_series_tx = acos_tx11,acos_1poly_p11,acos_1poly_p0 + nop.i 999;; +} + +{ .mfi + nop.m 999 +//(acos_pred_GTsqrt2by2) fnma.d 
f8 = acos_sgn_x,answer2,acos_const_piby2 +(p8) fnma.d f8 = acos_sgn_x,answer2,acos_const_add + nop.i 999;; +} + +{ .mfb + nop.m 999 +//(acos_pred_LEsqrt2by2) fnma.d f8 = f8,acos_series_tx,acos_const_piby2 +(p7) fnma.d f8 = f8,acos_series_tx,acos_const_piby2 + br.ret.sptk b0 ;; +} + + +L(ACOS_ZERO): +// Here if x=0 +{ .mfb + nop.m 999 + fma.d f8 = acos_const_piby2,f1,f0 + br.ret.sptk b0 ;; +} + + +L(ACOS_ABS_ONE): +.pred.rel "mutex",p11,p12 +// Here if |x|=1 +{ .mfi + nop.m 999 +(p11) fma.d f8 = acos_const_piby2,f1,acos_const_piby2 // acos(-1)=pi + nop.i 999 +} +{ .mfb + nop.m 999 +(p12) fma.d f8 = f1,f0,f0 // acos(1)=0 + br.ret.sptk b0 ;; +} + + +.endp acos +ASM_SIZE_DIRECTIVE(acos) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 999 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body + frcpa.s0 f9,p0 = f0,f0 +;; + +{ .mib + stfd [GR_Parameter_X] = f8 // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfd [GR_Parameter_Y] = f9,-16 // Store Parameter 3 on stack + adds r32 = 48,sp + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + ldfd f8 = [r32] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return + +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support,@function +.global __libm_error_support diff --git a/sysdeps/ia64/fpu/e_acosf.S 
b/sysdeps/ia64/fpu/e_acosf.S new file mode 100644 index 0000000000..5df3afcd25 --- /dev/null +++ b/sysdeps/ia64/fpu/e_acosf.S @@ -0,0 +1,693 @@ +.file "acosf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. + +// History +//============================================================== +// 2/02/00 Initial revision +// 6/28/00 Improved speed +// 6/31/00 Changed register allocation because of some duplicate macros +// moved nan exit bundle up to gain a cycle. +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 8/17/00 Changed predicate register macro-usage to direct predicate +// names due to an assembler bug. +// 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal. 
+ + +// Description +//========================================= +// The acosf function computes the principal value of the arc cosine of x. +// A domain error occurs for arguments not in the range [-1,+1]. + +// The acosf function returns the arc cosine in the range [0, +pi] radians. +// acos(1) returns +0 +// acos(x) returns a NaN and raises the invalid exception for |x| >1 + +// |x| <= sqrt(2)/2. get Ax and Bx + +// poly_p1 = x p1 +// poly_p3 = x2 p4 + p3 +// poly_p1 = x2 (poly_p1) + x = x2(x p1) + x +// poly_p2 = x2( poly_p3) + p2 = x2(x2 p4 + p3) + p2 + +// poly_Ax = x5(x2( poly_p3) + p2) + x2(x p1) + x +// = x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x + +// poly_p7 = x2 p8 + p7 +// poly_p5 = x2 p6 + p5 + +// poly_p7 = x4 p9 + (x2 p8 + p7) +// poly_Bx = x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5 + +// sinf1 = x11(x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5) + x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x +// = x19 p9 + x17 p8 + x15 p7 + x13 p6 + x11 p5 + x9 p4 + x7 p3 + x5 p2 + x3 p1 + x +// answer1 = pi/2 - sinf1 + + + +// |x| > sqrt(2)/2 + +// Get z = sqrt(1-x2) + +// Get polynomial in t = 1-x2 + +// t2 = t t +// t4 = t2 t2 + +// poly_p4 = t p5 + p4 +// poly_p1 = t p1 + 1 + +// poly_p6 = t p7 + p6 +// poly_p2 = t p3 + p2 + +// poly_p8 = t p9 + p8 + +// poly_p4 = t2 poly_p6 + poly_p4 +// = t2 (t p7 + p6) + (t p5 + p4) + +// poly_p2 = t2 poly_p2 + poly_p1 +// = t2 (t p3 + p2) + (t p1 + 1) + +// poly_p4 = t4 poly_p8 + poly_p4 +// = t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4)) + +// P(t) = poly_p2 + t4 poly_p4 +// = t2 (t p3 + p2) + (t p1 + 1) + t4 (t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4))) +// = t3 p3 + t2 p2 + t p1 + 1 + t9 p9 + t8 p8 + t7 p7 + t6 p6 + t5 p5 + t4 p4 + + +// answer2 = sign(x) z P(t) if x>0 +// = sign(x) z P(t) + pi if x<0 + +#include "libm_support.h" + +// +// Assembly macros +//========================================= + +// predicate registers +//acosf_pred_LEsqrt2by2 = p7 +//acosf_pred_GTsqrt2by2 = p8 + +// integer registers +ACOSF_Addr1 = r33 +ACOSF_Addr2 
= r34 +ACOSF_GR_1by2 = r35 + +ACOSF_GR_3by2 = r36 +ACOSF_GR_5by2 = r37 + +GR_SAVE_B0 = r38 +GR_SAVE_PFS = r39 +GR_SAVE_GP = r40 + +GR_Parameter_X = r41 +GR_Parameter_Y = r42 +GR_Parameter_RESULT = r43 +GR_Parameter_TAG = r44 + +// floating point registers + +acosf_y = f32 +acosf_abs_x = f33 +acosf_x2 = f34 +acosf_sgn_x = f35 + +acosf_1by2 = f36 +acosf_3by2 = f37 +acosf_5by2 = f38 +acosf_coeff_P3 = f39 +acosf_coeff_P8 = f40 + +acosf_coeff_P1 = f41 +acosf_coeff_P4 = f42 +acosf_coeff_P5 = f43 +acosf_coeff_P2 = f44 +acosf_coeff_P7 = f45 + +acosf_coeff_P6 = f46 +acosf_coeff_P9 = f47 +acosf_x2 = f48 +acosf_x3 = f49 +acosf_x4 = f50 + +acosf_x8 = f51 +acosf_x5 = f52 +acosf_const_piby2 = f53 +acosf_const_sqrt2by2 = f54 +acosf_x11 = f55 + +acosf_poly_p1 = f56 +acosf_poly_p3 = f57 +acosf_sinf1 = f58 +acosf_poly_p2 = f59 +acosf_poly_Ax = f60 + +acosf_poly_p7 = f61 +acosf_poly_p5 = f62 +acosf_sgnx_t4 = f63 +acosf_poly_Bx = f64 +acosf_t = f65 + +acosf_yby2 = f66 +acosf_B = f67 +acosf_B2 = f68 +acosf_Az = f69 +acosf_dz = f70 + +acosf_Sz = f71 +acosf_d2z = f72 +acosf_Fz = f73 +acosf_z = f74 +acosf_sgnx_z = f75 + +acosf_t2 = f76 +acosf_2poly_p4 = f77 +acosf_2poly_p6 = f78 +acosf_2poly_p1 = f79 +acosf_2poly_p2 = f80 + +acosf_2poly_p8 = f81 +acosf_t4 = f82 +acosf_Pt = f83 +acosf_sgnx_2poly_p2 = f84 +acosf_sgn_x_piby2 = f85 + +acosf_poly_p7a = f86 +acosf_2poly_p4a = f87 +acosf_2poly_p4b = f88 +acosf_2poly_p2a = f89 +acosf_poly_p1a = f90 + + + + + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +acosf_coeff_1_table: +ASM_TYPE_DIRECTIVE(acosf_coeff_1_table,@object) +data8 0x3FC5555607DCF816 // P1 +data8 0x3F9CF81AD9BAB2C6 // P4 +data8 0x3FC59E0975074DF3 // P7 +data8 0xBFA6F4CC2780AA1D // P6 +data8 0x3FC2DD45292E93CB // P9 +data8 0x3fe6a09e667f3bcd // sqrt(2)/2 +ASM_SIZE_DIRECTIVE(acosf_coeff_1_table) + +acosf_coeff_2_table: +ASM_TYPE_DIRECTIVE(acosf_coeff_2_table,@object) +data8 
0x3FA6F108E31EFBA6 // P3 +data8 0xBFCA31BF175D82A0 // P8 +data8 0x3FA30C0337F6418B // P5 +data8 0x3FB332C9266CB1F9 // P2 +data8 0x3ff921fb54442d18 // pi_by_2 +ASM_SIZE_DIRECTIVE(acosf_coeff_2_table) + +.align 32 +.global acosf +ASM_TYPE_DIRECTIVE(acosf,@function) + +.section .text +.proc acosf +.align 32 + +acosf: + +// Load the addresses of the two tables. +// Then, load the coefficients and other constants. + +{ .mfi + alloc r32 = ar.pfs,1,8,4,0 + fnma.s1 acosf_t = f8,f8,f1 + dep.z ACOSF_GR_1by2 = 0x3f,24,8 // 0x3f000000 +} +{ .mfi + addl ACOSF_Addr1 = @ltoff(acosf_coeff_1_table),gp + fma.s1 acosf_x2 = f8,f8,f0 + addl ACOSF_Addr2 = @ltoff(acosf_coeff_2_table),gp ;; +} + + +{ .mfi + ld8 ACOSF_Addr1 = [ACOSF_Addr1] + fmerge.s acosf_abs_x = f1,f8 + dep ACOSF_GR_3by2 = -1,r0,22,8 // 0x3fc00000 +} +{ .mlx + nop.m 999 + movl ACOSF_GR_5by2 = 0x40200000;; +} + + + +{ .mfi + setf.s acosf_1by2 = ACOSF_GR_1by2 + fmerge.s acosf_sgn_x = f8,f1 + nop.i 999 +} +{ .mfi + ld8 ACOSF_Addr2 = [ACOSF_Addr2] + nop.f 0 + nop.i 999;; +} + + +{ .mfi + setf.s acosf_5by2 = ACOSF_GR_5by2 + fcmp.lt.s1 p11,p12 = f8,f0 + nop.i 999;; +} + +{ .mmf + ldfpd acosf_coeff_P1,acosf_coeff_P4 = [ACOSF_Addr1],16 + setf.s acosf_3by2 = ACOSF_GR_3by2 + fclass.m.unc p8,p0 = f8, 0xc3 ;; //@qnan | @snan +} + + +{ .mfi + ldfpd acosf_coeff_P7,acosf_coeff_P6 = [ACOSF_Addr1],16 + fma.s1 acosf_t2 = acosf_t,acosf_t,f0 + nop.i 999 +} +{ .mfi + ldfpd acosf_coeff_P3,acosf_coeff_P8 = [ACOSF_Addr2],16 + fma.s1 acosf_x4 = acosf_x2,acosf_x2,f0 + nop.i 999;; +} + + +{ .mfi + ldfpd acosf_coeff_P9,acosf_const_sqrt2by2 = [ACOSF_Addr1] + fclass.m.unc p10,p0 = f8, 0x07 //@zero + nop.i 999 +} +{ .mfi + ldfpd acosf_coeff_P5,acosf_coeff_P2 = [ACOSF_Addr2],16 + fma.s1 acosf_x3 = f8,acosf_x2,f0 + nop.i 999;; +} + + +{ .mfi + ldfd acosf_const_piby2 = [ACOSF_Addr2] + frsqrta.s1 acosf_B,p0 = acosf_t + nop.i 999 +} +{ .mfb + nop.m 999 +(p8) fma.s f8 = f8,f1,f0 +(p8) br.ret.spnt b0 ;; // Exit if x=nan +} + + +{ .mfb + nop.m 999 + 
fcmp.eq.s1 p6,p0 = acosf_abs_x,f1 +(p10) br.cond.spnt L(ACOSF_ZERO) ;; // Branch if x=0 +} + +{ .mfi + nop.m 999 + fcmp.gt.s1 p9,p0 = acosf_abs_x,f1 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 acosf_x8 = acosf_x4,acosf_x4,f0 + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 acosf_t4 = acosf_t2,acosf_t2,f0 +(p6) br.cond.spnt L(ACOSF_ABS_ONE) ;; // Branch if |x|=1 +} + +{ .mfi + nop.m 999 + fma.s1 acosf_x5 = acosf_x2,acosf_x3,f0 + nop.i 999 +} +{ .mfb +(p9) mov GR_Parameter_TAG = 59 + fma.s1 acosf_yby2 = acosf_t,acosf_1by2,f0 +(p9) br.cond.spnt __libm_error_region ;; // Branch if |x|>1 +} + + +{ .mfi + nop.m 999 + fma.s1 acosf_Az = acosf_t,acosf_B,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acosf_B2 = acosf_B,acosf_B,f0 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 acosf_poly_p1 = f8,acosf_coeff_P1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acosf_2poly_p1 = acosf_coeff_P1,acosf_t,f1 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 acosf_poly_p3 = acosf_coeff_P4,acosf_x2,acosf_coeff_P3 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acosf_2poly_p6 = acosf_coeff_P7,acosf_t,acosf_coeff_P6 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 acosf_poly_p7 = acosf_x2,acosf_coeff_P8,acosf_coeff_P7 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acosf_2poly_p2 = acosf_coeff_P3,acosf_t,acosf_coeff_P2 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acosf_poly_p5 = acosf_x2,acosf_coeff_P6,acosf_coeff_P5 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acosf_2poly_p4 = acosf_coeff_P5,acosf_t,acosf_coeff_P4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acosf_x11 = acosf_x8,acosf_x3,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fnma.s1 acosf_dz = acosf_B2,acosf_yby2,acosf_1by2 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acosf_poly_p1a = acosf_x2,acosf_poly_p1,f8 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acosf_2poly_p8 = acosf_coeff_P9,acosf_t,acosf_coeff_P8 + nop.i 999;; +} + + +// Get the absolute value of x and determine the region in which x lies + +{ .mfi + nop.m 999 + fcmp.le.s1 
p7,p8 = acosf_abs_x,acosf_const_sqrt2by2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acosf_poly_p2 = acosf_x2,acosf_poly_p3,acosf_coeff_P2 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 acosf_poly_p7a = acosf_x4,acosf_coeff_P9,acosf_poly_p7 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 acosf_2poly_p2a = acosf_2poly_p2,acosf_t2,acosf_2poly_p1 + nop.i 999;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 acosf_sgnx_t4 = acosf_sgn_x,acosf_t4,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 acosf_2poly_p4a = acosf_2poly_p6,acosf_t2,acosf_2poly_p4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 acosf_Sz = acosf_5by2,acosf_dz,acosf_3by2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 acosf_d2z = acosf_dz,acosf_dz,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 +(p8) fnma.d.s1 acosf_sgn_x_piby2 = acosf_sgn_x,acosf_const_piby2,acosf_const_piby2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 acosf_poly_Ax = acosf_x5,acosf_poly_p2,acosf_poly_p1a + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p7) fma.s1 acosf_poly_Bx = acosf_x4,acosf_poly_p7a,acosf_poly_p5 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 acosf_sgnx_2poly_p2 = acosf_sgn_x,acosf_2poly_p2a,f0 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fcmp.eq.s0 p6,p0 = f8,f0 // Only purpose is to set D if x denormal + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 acosf_2poly_p4b = acosf_2poly_p8,acosf_t4,acosf_2poly_p4a + nop.i 999;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 acosf_Fz = acosf_d2z,acosf_Sz,acosf_dz + nop.i 999;; +} + + +{ .mfi + nop.m 999 +(p8) fma.d.s1 acosf_Pt = acosf_2poly_p4b,acosf_sgnx_t4,acosf_sgnx_2poly_p2 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p8) fma.d.s1 acosf_z = acosf_Az,acosf_Fz,acosf_Az + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.d.s1 acosf_sinf1 = acosf_x11,acosf_poly_Bx,acosf_poly_Ax + nop.i 999;; +} + +.pred.rel "mutex",p8,p7 //acosf_pred_GTsqrt2by2,acosf_pred_LEsqrt2by2 +{ .mfi + nop.m 999 +(p8) fma.s f8 = acosf_z,acosf_Pt,acosf_sgn_x_piby2 + nop.i 999 +} + +{ .mfb + nop.m 999 +(p7) fms.s f8 = 
acosf_const_piby2,f1,acosf_sinf1 + br.ret.sptk b0 ;; +} + +L(ACOSF_ZERO): +// Here if x=0 +{ .mfb + nop.m 999 + fma.s f8 = acosf_const_piby2,f1,f0 // acosf(0)=pi/2 + br.ret.sptk b0 ;; +} + + +L(ACOSF_ABS_ONE): +.pred.rel "mutex",p11,p12 +// Here if |x|=1 +{ .mfi + nop.m 999 +(p11) fma.s f8 = acosf_const_piby2,f1,acosf_const_piby2 // acosf(-1)=pi + nop.i 999 +} +{ .mfb + nop.m 999 +(p12) fma.s f8 = f1,f0,f0 // acosf(1)=0 + br.ret.sptk b0 ;; +} + +.endp acosf +ASM_SIZE_DIRECTIVE(acosf) + + +// Stack operations when calling error support. +// (1) (2) +// sp -> + psp -> + +// | | +// | | <- GR_Y +// | | +// | <-GR_Y Y2->| +// | | +// | | <- GR_X +// | | +// sp-64 -> + sp -> + +// save ar.pfs save b0 +// save gp + + +// Stack operations when calling error support. +// (3) (call) (4) +// psp -> + sp -> + +// | | +// R3 ->| <- GR_RESULT | -> f8 +// | | +// Y2 ->| <- GR_Y | +// | | +// X1 ->| | +// | | +// sp -> + + +// restore gp +// restore ar.pfs + + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 999 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +{ .mfi + nop.m 0 + frcpa.s0 f9,p0 = f0,f0 + nop.i 0 +};; + +{ .mib + stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + 
add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_acosl.S b/sysdeps/ia64/fpu/e_acosl.S new file mode 100644 index 0000000000..81f56e41c8 --- /dev/null +++ b/sysdeps/ia64/fpu/e_acosl.S @@ -0,0 +1,1094 @@ +.file "acosl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// History +//============================================================== +// 2/02/00 Initial version +// 2/07/00 Modified calculation of acos_corr to correct acosl +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 12/20/00 Set denormal flag properly. +// +// API +//============================================================== +// double-extended = acosl (double-extended) +// input floating point f8 +// output floating point f8 +// +// Registers used +//============================================================== +// +// predicate registers used: +// p6 -> p12 +// +// floating-point registers used: +// f8 has input, then output +// f8 -> f15, f32 ->f99 +// +// general registers used: +// r32 -> r48 +// +// Overview of operation +//============================================================== +// There are three paths +// 1. |x| < 2^-25 ACOS_TINY +// 2. 2^-25 <= |x| < 1/4 ACOS_POLY +// 3. 
1/4 <= |x| < 1 ACOS_ATAN + +#include "libm_support.h" + +// Assembly macros +//============================================================== + +// f8 is input, but acos_V must be put in f8 +// when __libm_atan2_reg is called, f8 must get V +// f9 gets U when __libm_atan2_reg is called + + +// __libm_atan2_reg returns +// f8 = Z_hi +// f10 = Z_lo +// f11 = s_lo + +acos_Z_hi = f8 +acos_Z_lo = f10 +acos_S_lo = f11 + +// When we call __libm_atan2_reg, we must save +// the following: + +acos_corr = f12 +acos_X = f13 +acos_pi_hi = f14 +acos_pi_lo = f15 + +// The rest of the assembly macros + +acos_P79 = f32 +acos_P59 = f33 +acos_P39 = f34 +acos_P19 = f35 + +acos_P810 = f36 +acos_P610 = f37 +acos_P410 = f38 +acos_P210 = f39 + +acos_A1 = f41 +acos_A2 = f42 +acos_A3 = f43 +acos_A4 = f44 +acos_A5 = f45 +acos_A6 = f46 +acos_A7 = f47 +acos_A8 = f48 +acos_A9 = f49 +acos_A10 = f50 + +acos_X2 = f51 +acos_X4 = f52 + +acos_B = f53 +acos_Bb = f54 +acos_A = f55 +acos_Aa = f56 + +acos_1mA = f57 + +acos_W = f58 +acos_Ww = f59 + +acos_y0 = f60 +acos_y1 = f61 +acos_y2 = f62 + +acos_H = f63 +acos_Hh = f64 + +acos_t1 = f65 +acos_t2 = f66 +acos_t3 = f67 +acos_t4 = f68 +acos_t5 = f69 + +acos_Pseries = f70 +acos_NORM_f8 = f71 +acos_ABS_NORM_f8 = f72 + +acos_2 = f73 +acos_P1P2 = f74 +acos_HALF = f75 +acos_U = f76 + +acos_1mB = f77 +acos_V = f78 +acos_S = f79 + +acos_BmUU = f80 +acos_BmUUpb = f81 +acos_2U = f82 +acos_1d2U = f83 + +acos_Dd = f84 + +acos_pi_by_2_hi = f85 +acos_pi_by_2_lo = f86 +acos_xmpi_by_2_lo = f87 +acos_xPmw = f88 + +acos_Uu = f89 +acos_AmVV = f90 +acos_AmVVpa = f91 + +acos_2V = f92 +acos_1d2V = f93 +acos_Vv = f94 + +acos_Vu = f95 +acos_Uv = f96 + +acos_2_Z_hi = f97 +acos_s_lo_Z_lo = f98 +acos_result_lo = f99 + +acos_Z_hi = f8 +acos_Z_lo = f10 +acos_s_lo = f11 + +acos_GR_17_ones = r33 +acos_GR_16_ones = r34 +acos_GR_signexp_f8 = r35 +acos_GR_exp = r36 +acos_GR_true_exp = r37 +acos_GR_fffe = r38 + +GR_SAVE_PFS = r43 +GR_SAVE_B0 = r39 +GR_SAVE_GP = r41 + +// r40 is address of 
table of coefficients +// r42 + +GR_Parameter_X = r44 +GR_Parameter_Y = r45 +GR_Parameter_RESULT = r46 +GR_Parameter_TAG = r47 + + +// 2^-40: +// A true exponent of -40 is +// : -40 + register_bias +// : -28 + ffff = ffd7 + +// A true exponent of 1 is +// : 1 + register_bias +// : 1 + ffff = 10000 + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +acos_coefficients: +ASM_TYPE_DIRECTIVE(acos_coefficients,@object) +data8 0xc90fdaa22168c234, 0x00003FFF // pi_by_2_hi +data8 0xc4c6628b80dc1cd1, 0x00003FBF // pi_by_2_lo +data8 0xc90fdaa22168c234, 0x00004000 // pi_hi +data8 0xc4c6628b80dc1cd1, 0x00003FC0 // pi_lo + +data8 0xBB08911F2013961E, 0x00003FF8 // A10 +data8 0x981F1095A23A87D3, 0x00003FF8 // A9 +data8 0xBDF09C6C4177BCC6, 0x00003FF8 // A8 +data8 0xE4C3A60B049ACCEA, 0x00003FF8 // A7 +data8 0x8E2789F4E8A8F1AD, 0x00003FF9 // A6 +data8 0xB745D09B2B0E850B, 0x00003FF9 // A5 +data8 0xF8E38E3BC4C50920, 0x00003FF9 // A4 +data8 0xB6DB6DB6D89FCD81, 0x00003FFA // A3 +data8 0x99999999999AF376, 0x00003FFB // A2 +data8 0xAAAAAAAAAAAAAA71, 0x00003FFC // A1 +ASM_SIZE_DIRECTIVE(acos_coefficients) + + +.align 32 +.global acosl# +ASM_TYPE_DIRECTIVE(acosl#,@function) + +.section .text +.proc acosl# +.align 32 + + +acosl: + +// After normalizing f8, get its true exponent +{ .mfi + alloc r32 = ar.pfs,1,11,4,0 +(p0) fnorm.s1 acos_NORM_f8 = f8 +(p0) mov acos_GR_17_ones = 0x1ffff +} + +{ .mmi +(p0) mov acos_GR_16_ones = 0xffff +(p0) addl r40 = @ltoff(acos_coefficients), gp + nop.i 999 +} +;; + +// Set denormal flag on denormal input with fcmp +{ .mfi + ld8 r40 = [r40] + fcmp.eq p6,p0 = f8,f0 + nop.i 999 +} +;; + + +// Load the constants pi_by_2 and pi. 
+// Each is stored as hi and lo values +// Also load the coefficients for ACOS_POLY + +{ .mmi +(p0) ldfe acos_pi_by_2_hi = [r40],16 ;; +(p0) ldfe acos_pi_by_2_lo = [r40],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe acos_pi_hi = [r40],16 ;; +(p0) ldfe acos_pi_lo = [r40],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe acos_A10 = [r40],16 ;; +(p0) ldfe acos_A9 = [r40],16 + nop.i 999 ;; +} + +// Take the absolute value of f8 +{ .mmf + nop.m 999 +(p0) getf.exp acos_GR_signexp_f8 = acos_NORM_f8 +(p0) fmerge.s acos_ABS_NORM_f8 = f0, acos_NORM_f8 +} + +{ .mii +(p0) ldfe acos_A8 = [r40],16 + nop.i 999 ;; +(p0) and acos_GR_exp = acos_GR_signexp_f8, acos_GR_17_ones ;; +} + +// case 1: |x| < 2^-25 ==> p6 ACOS_TINY +// case 2: 2^-25 <= |x| < 2^-2 ==> p8 ACOS_POLY +// case 3: 2^-2 <= |x| < 1 ==> p9 ACOS_ATAN +// case 4: 1 <= |x| ==> p11 ACOS_ERROR_RETURN +// Admittedly |x| = 1 is not an error but this is where that case is +// handled. + +{ .mii +(p0) ldfe acos_A7 = [r40],16 +(p0) sub acos_GR_true_exp = acos_GR_exp, acos_GR_16_ones ;; +(p0) cmp.ge.unc p6, p7 = -26, acos_GR_true_exp ;; +} + +{ .mii +(p0) ldfe acos_A6 = [r40],16 +(p7) cmp.ge.unc p8, p9 = -3, acos_GR_true_exp ;; +(p9) cmp.ge.unc p10, p11 = -1, acos_GR_true_exp +} + +{ .mmi +(p0) ldfe acos_A5 = [r40],16 ;; +(p0) ldfe acos_A4 = [r40],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe acos_A3 = [r40],16 ;; +(p0) ldfe acos_A2 = [r40],16 + nop.i 999 ;; +} + +// ACOS_ERROR_RETURN ==> p11 is true +// case 4: |x| >= 1 +{ .mib +(p0) ldfe acos_A1 = [r40],16 + nop.i 999 +(p11) br.spnt L(ACOS_ERROR_RETURN) ;; +} + +// ACOS_TINY ==> p6 is true +// case 1: |x| < 2^-25 +{ .mfi + nop.m 999 +(p6) fms.s1 acos_xmpi_by_2_lo = acos_NORM_f8,f1, acos_pi_by_2_lo + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p6) fms.s0 f8 = acos_pi_by_2_hi,f1, acos_xmpi_by_2_lo +(p6) br.ret.spnt b0 ;; +} + + + +// ACOS_POLY ==> p8 is true +// case 2: 2^-25 <= |x| < 2^-2 +{ .mfi + nop.m 999 +(p8) fms.s1 acos_W = acos_pi_by_2_hi, f1, acos_NORM_f8 + nop.i 999 ;; +} + +{ .mfi + 
nop.m 999 +(p8) fma.s1 acos_X2 = f8,f8, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fms.s1 acos_Ww = acos_pi_by_2_hi, f1, acos_W + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 acos_X4 = acos_X2,acos_X2, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fms.s1 acos_Ww = acos_Ww, f1, acos_NORM_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 acos_P810 = acos_X4, acos_A10, acos_A8 + nop.i 999 +} + +// acos_P79 = X4*A9 + A7 +// acos_P810 = X4*A10 + A8 +{ .mfi + nop.m 999 +(p8) fma.s1 acos_P79 = acos_X4, acos_A9, acos_A7 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 acos_Ww = acos_Ww, f1, acos_pi_by_2_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 acos_P610 = acos_X4, acos_P810, acos_A6 + nop.i 999 +} + + +// acos_P59 = X4*(X4*A9 + A7) + A5 +// acos_P610 = X4*(X4*A10 + A8) + A6 +{ .mfi + nop.m 999 +(p8) fma.s1 acos_P59 = acos_X4, acos_P79, acos_A5 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 acos_P410 = acos_X4, acos_P610, acos_A4 + nop.i 999 +} + +// acos_P39 = X4*(X4*(X4*A9 + A7) + A5) + A3 +// acos_P410 = X4*(X4*(X4*A10 + A8) + A6) + A4 +{ .mfi + nop.m 999 +(p8) fma.s1 acos_P39 = acos_X4, acos_P59, acos_A3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 acos_P210 = acos_X4, acos_P410, acos_A2 + nop.i 999 +} + +// acos_P19 = X4*(X4*(X4*(X4*A9 + A7) + A5) + A3) + A1 = P1 +// acos_P210 = X4*(X4*(X4*(X4*A10 + A8) + A6) + A4) + A2 = P2 +{ .mfi + nop.m 999 +(p8) fma.s1 acos_P19 = acos_X4, acos_P39, acos_A1 + nop.i 999 ;; +} + +// acos_P1P2 = Xsq*P2 + P1 +// acos_P1P2 = Xsq*(Xsq*P2 + P1) +{ .mfi + nop.m 999 +(p8) fma.s1 acos_P1P2 = acos_X2, acos_P210, acos_P19 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 acos_P1P2 = acos_X2, acos_P1P2, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fms.s1 acos_xPmw = acos_NORM_f8, acos_P1P2, acos_Ww + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p8) fms.s0 f8 = acos_W, f1, acos_xPmw +(p8) br.ret.spnt b0 ;; +} + + +// ACOS_ATAN +// case 3: 2^-2 <= |x| < 1 +// case 3: 2^-2 <= |x| < 1 ==> p9 
ACOS_ATAN + +// Step 1.1: Get A,B and a,b +// A + a = 1- |X| +// B + b = 1+ |X| +// Note also that we will use acos_corr (f12) +// and acos_W + +// Step 2 +// Call __libm_atan2_reg + + +{ .mfi +(p0) mov acos_GR_fffe = 0xfffe +(p0) fma.s1 acos_B = f1,f1, acos_ABS_NORM_f8 +(p0) mov GR_SAVE_B0 = b0 ;; +} + +{ .mmf +(p0) mov GR_SAVE_GP = gp + nop.m 999 +(p0) fms.s1 acos_A = f1,f1, acos_ABS_NORM_f8 +} + +{ .mfi +(p0) setf.exp acos_HALF = acos_GR_fffe + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fms.s1 acos_1mB = f1,f1, acos_B + nop.i 999 ;; +} + +// We want atan2(V,U) +// so put V in f8 and U in f9 +// but save X in acos_X + +{ .mfi + nop.m 999 +(p0) fmerge.se acos_X = f8, f8 + nop.i 999 ;; +} + +// Step 1.2: +///////////////////////// +// Get U = sqrt(B) +///////////////////////// + +{ .mfi + nop.m 999 +(p0) frsqrta.s1 acos_y0,p8 = acos_B + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fms.s1 acos_1mA = f1,f1, acos_A + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_Bb = acos_1mB,f1, acos_ABS_NORM_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_Hh = acos_HALF, acos_B, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_t1 = acos_y0, acos_y0, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fms.s1 acos_Aa = acos_1mA,f1, acos_ABS_NORM_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 acos_t2 = acos_t1, acos_Hh, acos_HALF + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_y1 = acos_t2, acos_y0, acos_y0 + nop.i 999 +} + + +// Step 1.2: +///////////////////////// +// Get V = sqrt(A) +///////////////////////// +{ .mfi + nop.m 999 +(p0) frsqrta.s1 acos_y0,p8 = acos_A + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_t3 = acos_y1, acos_Hh, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_t1 = acos_y0, acos_y0, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 acos_t4 = acos_t3, acos_y1, acos_HALF + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_y2 = acos_t4, acos_y1, acos_y1 + nop.i 999 ;; +} 
+ +{ .mfi + nop.m 999 +(p0) fma.s1 acos_S = acos_B, acos_y2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_H = acos_y2, acos_HALF, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_t5 = acos_Hh, acos_y2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_Hh = acos_HALF, acos_A, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 acos_Dd = acos_S, acos_S, acos_B + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 acos_t2 = acos_t1, acos_Hh, acos_HALF + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_U = acos_Dd, acos_H, acos_S + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_y1 = acos_t2, acos_y0, acos_y0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_2U = acos_U, f1, acos_U + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_t3 = acos_y1, acos_Hh, f0 + nop.i 999 +} + + +// Step 1.3: +// sqrt(A + a) = V + v +// sqrt(B + b) = U + u + +///////////////////////// +// Get u +///////////////////////// + +// acos_BmUU = B - UU +// acos_BmUUpb = (B - UU) + b + +{ .mfi + nop.m 999 +(p0) fnma.s1 acos_BmUU = acos_U, acos_U, acos_B + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.se f9 = acos_U, acos_U + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 acos_t4 = acos_t3, acos_y1, acos_HALF + nop.i 999 ;; +} + +// acos_1d2U = frcpa(2U) +{ .mfi + nop.m 999 +(p0) frcpa.s1 acos_1d2U,p9 = f1, acos_2U + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_BmUUpb = acos_BmUU, f1, acos_Bb + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_y2 = acos_t4, acos_y1, acos_y1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// acos_Uu = ((B - UU) + b) * frcpa(2U) +(p0) fma.s1 acos_Uu = acos_BmUUpb, acos_1d2U, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_S = acos_A, acos_y2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_H = acos_y2, acos_HALF, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_t5 = acos_Hh, acos_y2, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 acos_Dd 
= acos_S, acos_S, acos_A + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_V = acos_Dd, acos_H, acos_S + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_2V = acos_V, f1, acos_V + nop.i 999 +} + +// Step 3 +///////////////////////// +// Calculate the correction, acos_corr +///////////////////////// +// acos_corr = U*v - (V*u) + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_Vu = acos_V,acos_Uu, f0 + nop.i 999 ;; +} + +///////////////////////// +// Get v +///////////////////////// +// acos_AmVV = A - VV +// acos_AmVVpa = (A - VV) + a + +{ .mfi + nop.m 999 +(p0) fnma.s1 acos_AmVV = acos_V, acos_V, acos_A + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.se f8 = acos_V, acos_V + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_AmVVpa = acos_AmVV, f1, acos_Aa + nop.i 999 ;; +} + +// acos_1d2V = frcpa(2V) +{ .mfi + nop.m 999 +(p0) frcpa.s1 acos_1d2V,p9 = f1, acos_2V + nop.i 999 ;; +} + +// acos_Vv = ((A - VV) + a) * frcpa(2V) +{ .mfi + nop.m 999 +(p0) fma.s1 acos_Vv = acos_AmVVpa, acos_1d2V, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_Uv = acos_U,acos_Vv, f0 + nop.i 999 ;; +} + + +.endp acosl# +ASM_SIZE_DIRECTIVE(acosl#) + + +.proc __libm_callout +__libm_callout: +.prologue +{ .mfi + nop.m 0 + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +;; + +{ .mfi + mov GR_SAVE_GP=gp + nop.f 0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +} + +.body +{ .mfb + nop.m 999 +(p0) fms.s1 acos_corr = acos_Uv,f1, acos_Vu +(p0) br.call.sptk.many b0=__libm_atan2_reg# ;; +} + + +// p6 ==> X is negative +// p7 ==> x is positive +// We know that |X| >= 1/4 + +{ .mfi +(p0) mov gp = GR_SAVE_GP +(p0) fcmp.lt.unc p6,p7 = acos_X , f0 +(p0) mov b0 = GR_SAVE_B0 ;; +} + +// acos_2_Z_hi = 2 * acos_Z_hi +// acos_s_lo_Z_lo = s_lo * Z_lo + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_2_Z_hi = acos_Z_hi, f1, acos_Z_hi +(p0) mov ar.pfs = GR_SAVE_PFS +} + +{ .mfi + nop.m 999 +(p0) fma.s1 acos_s_lo_Z_lo = acos_s_lo, acos_Z_lo, f0 + nop.i 999 ;; +} + +// 2 is a constant 
needed later +{ .mfi + nop.m 999 +(p0) fma.s1 acos_2 = f1,f1,f1 + nop.i 999 ;; +} + +// X >= 1/4 +// acos_result_lo = 2(s_lo * Z_lo) - corr +// f8 = (2*Z_hi) + (2(s_lo * Z_lo) - corr) + +{ .mfi + nop.m 999 +(p7) fma.s1 acos_result_lo = acos_s_lo_Z_lo, acos_2, acos_corr + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.s0 f8 = acos_2_Z_hi, f1, acos_result_lo + nop.i 999 +} + +// acos_result_lo = (pi_lo - corr) +// acos_result_lo = (pi_lo - corr) + acos_Ww +{ .mfi + nop.m 999 +(p6) fms.s1 acos_result_lo = acos_pi_lo, f1, acos_corr + nop.i 999 ;; +} + +// X <= -1/4 +// acos_W = pi_hi - 2 * Z_hi +{ .mfi + nop.m 999 +(p6) fnma.s1 acos_W = acos_2, acos_Z_hi, acos_pi_hi + nop.i 999 ;; +} + +// acos_Ww = pi_hi - W +// acos_Ww = (pi_hi - W) + (2 * Z_hi) +{ .mfi + nop.m 999 +(p6) fms.s1 acos_Ww = acos_pi_hi, f1, acos_W + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p6) fms.s1 acos_Ww = acos_Ww, f1, acos_2_Z_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p6) fma.s1 acos_result_lo = acos_result_lo, f1, acos_Ww + nop.i 999 ;; +} + +// acos_Z_lo = ((pi_lo - corr) + acos_Ww) - 2 * (s_lo * Z_lo) +{ .mfi + nop.m 999 +(p6) fnma.s1 acos_Z_lo = acos_s_lo_Z_lo, acos_2, acos_result_lo + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p6) fma.s0 f8 = acos_W, f1, acos_Z_lo +(p0) br.ret.sptk b0 ;; +} +.endp __libm_callout +ASM_SIZE_DIRECTIVE(__libm_callout) + +.proc SPECIAL +SPECIAL: +L(ACOS_NAN): +{ .mfb + nop.m 999 +(p0) fma.s0 f8 = f8,f1,f0 +(p0) br.ret.sptk b0 ;; +} + +L(ACOS_ERROR_RETURN): +// Save ar.pfs, b0, and gp; restore on exit + +// qnan snan inf norm unorm 0 -+ +// 1 1 0 0 0 0 11 = 0xc3 + +// Coming in as X = +- 1 +// What should we return? 
+ +// If |X| is 1, return pi for X = -1 and +0 for X = +1 + + +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc p6,p7 = acos_ABS_NORM_f8,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p6) fcmp.lt.unc p8,p9 = f8,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s0 f8 = acos_pi_hi, f1, acos_pi_lo + nop.i 999 +} + +{ .mfb + nop.m 999 +(p9) fmerge.s f8 = f8,f0 +(p6) br.ret.spnt b0 ;; +} + +// If X is a NAN, leave +{ .mfi + nop.m 999 +(p0) fclass.m.unc p12,p0 = f8, 0xc3 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p12) fma.s0 f8 = f8,f1,f0 +(p12) br.ret.spnt b0 ;; +} + +{ .mfi +(p0) mov GR_Parameter_TAG = 57 +(p0) frcpa f10, p6 = f0, f0 +nop.i 999 +};; + +.endp SPECIAL +ASM_SIZE_DIRECTIVE(SPECIAL) + +.proc __libm_error_region +__libm_error_region: +.prologue +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfe [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfe [GR_Parameter_X] = f8 // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfe [GR_Parameter_Y] = f10 // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; + +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type 
__libm_error_support#,@function +.global __libm_error_support# + +.type __libm_atan2_reg#,@function +.global __libm_atan2_reg# diff --git a/sysdeps/ia64/fpu/e_asin.S b/sysdeps/ia64/fpu/e_asin.S new file mode 100644 index 0000000000..cd19fce407 --- /dev/null +++ b/sysdeps/ia64/fpu/e_asin.S @@ -0,0 +1,884 @@ +.file "asin.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. + +// History +//============================================================== +// 2/02/00 Initial version +// 8/17/00 New and much faster algorithm. +// 8/31/00 Avoided bank conflicts on loads, shortened |x|=1 path, +// fixed mfb split issue stalls. +// 12/19/00 Fixed small arg cases to force inexact, or inexact and underflow. 
+ +// Description +//========================================= +// The asin function computes the principal value of the arc sine of x. +// asin(0) returns 0, asin(1) returns pi/2, asin(-1) returns -pi/2. +// A domain error occurs for arguments not in the range [-1,+1]. + +// The asin function returns the arc sine in the range [-pi/2, +pi/2] radians. + +#include "libm_support.h" + +// +// Assembly macros +//========================================= + + +// predicate registers +//asin_pred_LEsqrt2by2 = p7 +//asin_pred_GTsqrt2by2 = p8 + +// integer registers +ASIN_Addr1 = r33 +ASIN_Addr2 = r34 +ASIN_FFFE = r35 +ASIN_lnorm_sig = r36 +ASIN_snorm_exp = r37 + +GR_SAVE_B0 = r36 +GR_SAVE_PFS = r37 +GR_SAVE_GP = r38 + +GR_Parameter_X = r39 +GR_Parameter_Y = r40 +GR_Parameter_RESULT = r41 +GR_Parameter_Tag = r42 + +// floating point registers +asin_coeff_P1 = f32 +asin_coeff_P2 = f33 +asin_coeff_P3 = f34 +asin_coeff_P4 = f35 + +asin_coeff_P5 = f36 +asin_coeff_P6 = f37 +asin_coeff_P7 = f38 +asin_coeff_P8 = f39 +asin_coeff_P9 = f40 + +asin_coeff_P10 = f41 +asin_coeff_P11 = f42 +asin_coeff_P12 = f43 +asin_coeff_P13 = f44 +asin_coeff_P14 = f45 + +asin_coeff_P15 = f46 +asin_coeff_P16 = f47 +asin_coeff_P17 = f48 +asin_coeff_P18 = f49 +asin_coeff_P19 = f50 + +asin_coeff_P20 = f51 +asin_coeff_P21 = f52 +asin_const_sqrt2by2 = f53 +asin_const_piby2 = f54 +asin_abs_x = f55 + +asin_tx = f56 +asin_tx2 = f57 +asin_tx3 = f58 +asin_tx4 = f59 +asin_tx8 = f60 + +asin_tx11 = f61 +asin_1poly_p8 = f62 +asin_1poly_p19 = f63 +asin_1poly_p4 = f64 +asin_1poly_p15 = f65 + +asin_1poly_p6 = f66 +asin_1poly_p17 = f67 +asin_1poly_p0 = f68 +asin_1poly_p11 = f69 +asin_1poly_p2 = f70 + +asin_1poly_p13 = f71 +asin_series_tx = f72 +asin_t = f73 +asin_t2 = f74 +asin_t3 = f75 + +asin_t4 = f76 +asin_t8 = f77 +asin_t11 = f78 +asin_poly_p8 = f79 +asin_poly_p19 = f80 + +asin_poly_p4 = f81 +asin_poly_p15 = f82 +asin_poly_p6 = f83 +asin_poly_p17 = f84 +asin_poly_p0 = f85 + +asin_poly_p11 = f86 +asin_poly_p2 = f87 
+asin_poly_p13 = f88 +asin_series_t = f89 +asin_1by2 = f90 + +asin_3by2 = f91 +asin_5by2 = f92 +asin_11by4 = f93 +asin_35by8 = f94 +asin_63by8 = f95 + +asin_231by16 = f96 +asin_y0 = f97 +asin_H0 = f98 +asin_S0 = f99 +asin_d = f100 + +asin_l1 = f101 +asin_d2 = f102 +asin_T0 = f103 +asin_d1 = f104 +asin_e0 = f105 + +asin_l2 = f106 +asin_d3 = f107 +asin_T3 = f108 +asin_S1 = f109 +asin_e1 = f110 + +asin_z = f111 +answer2 = f112 +asin_sgn_x = f113 +asin_429by16 = f114 +asin_18by4 = f115 + +asin_3by4 = f116 +asin_l3 = f117 +asin_T6 = f118 +asin_eps_exp = f119 +asin_eps_sig = f120 +asin_eps = f120 + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +asin_coeff_1_table: +ASM_TYPE_DIRECTIVE(asin_coeff_1_table,@object) +data8 0xE4E7E0A423A21249 , 0x00003FF8 //P7 +data8 0xC2F7EE0200FCE2A5 , 0x0000C003 //P18 +data8 0xB745D7F6C65C20E0 , 0x00003FF9 //P5 +data8 0xF75E381A323D4D94 , 0x0000C002 //P16 +data8 0x8959C2629C1024C0 , 0x0000C002 //P20 +data8 0xAFF68E7D241292C5 , 0x00003FF8 //P9 +data8 0xB6DB6DB7260AC30D , 0x00003FFA //P3 +data8 0xD0417CE2B41CB7BF , 0x0000C000 //P14 +data8 0x81D570FEA724E3E4 , 0x0000BFFD //P12 +data8 0xAAAAAAAAAAAAC277 , 0x00003FFC //P1 +data8 0xF534912FF3E7B76F , 0x00003FFF //P21 +data8 0xc90fdaa22168c235 , 0x00003fff // pi/2 +data8 0x0000000000000000 , 0x00000000 // pad to avoid data bank conflict +ASM_SIZE_DIRECTIVE(asin_coeff_1_table) + + +asin_coeff_2_table: +ASM_TYPE_DIRECTIVE(asin_coeff_2_table,@object) +data8 0x8E26AF5F29B39A2A , 0x00003FF9 //P6 +data8 0xB4F118A4B1015470 , 0x00004003 //P17 +data8 0xF8E38E10C25990E0 , 0x00003FF9 //P4 +data8 0x80F50489AEF1CAC6 , 0x00004002 //P15 +data8 0x92728015172CFE1C , 0x00004003 //P19 +data8 0xBBC3D831D4595971 , 0x00003FF8 //P8 +data8 0x999999999952A5C3 , 0x00003FFB //P2 +data8 0x855576BE6F0975EC , 0x00003FFF //P13 +data8 0xF12420E778077D89 , 0x00003FFA //P11 +data8 0xB6590FF4D23DE003 , 0x00003FF3 //P10 +data8 
0xb504f333f9de6484 , 0x00003ffe // sqrt(2)/2 +ASM_SIZE_DIRECTIVE(asin_coeff_2_table) + + + +.align 32 +.global asin + +.section .text +.proc asin +.align 32 + + +asin: + +{ .mfi + alloc r32 = ar.pfs,1,6,4,0 + fma.s1 asin_tx = f8,f8,f0 + addl ASIN_Addr2 = @ltoff(asin_coeff_2_table),gp +} +{ .mfi + mov ASIN_FFFE = 0xFFFE + fnma.s1 asin_t = f8,f8,f1 + addl ASIN_Addr1 = @ltoff(asin_coeff_1_table),gp +} +;; + + +{ .mfi + setf.exp asin_1by2 = ASIN_FFFE + fmerge.s asin_abs_x = f1,f8 + nop.i 999 ;; +} + +{ .mmf + ld8 ASIN_Addr1 = [ASIN_Addr1] + ld8 ASIN_Addr2 = [ASIN_Addr2] + fmerge.s asin_sgn_x = f8,f1 ;; +} + + +{ .mfi + ldfe asin_coeff_P7 = [ASIN_Addr1],16 + fma.s1 asin_tx2 = asin_tx,asin_tx,f0 + nop.i 999 +} +{ .mfi + ldfe asin_coeff_P6 = [ASIN_Addr2],16 + fma.s1 asin_t2 = asin_t,asin_t,f0 + nop.i 999;; +} + + +{ .mmf + ldfe asin_coeff_P18 = [ASIN_Addr1],16 + ldfe asin_coeff_P17 = [ASIN_Addr2],16 + fclass.m.unc p8,p0 = f8, 0xc3 //@qnan |@snan +} +;; + +{ .mmf + ldfe asin_coeff_P5 = [ASIN_Addr1],16 + ldfe asin_coeff_P4 = [ASIN_Addr2],16 + frsqrta.s1 asin_y0,p0 = asin_t +} +;; + +{ .mfi + ldfe asin_coeff_P16 = [ASIN_Addr1],16 + fcmp.gt.s1 p9,p0 = asin_abs_x,f1 + nop.i 999 +} +{ .mfb + ldfe asin_coeff_P15 = [ASIN_Addr2],16 +(p8) fma.d f8 = f8,f1,f0 +(p8) br.ret.spnt b0 +} +;; + + +{ .mmf + ldfe asin_coeff_P20 = [ASIN_Addr1],16 + ldfe asin_coeff_P19 = [ASIN_Addr2],16 + fclass.m.unc p8,p0 = f8, 0x07 //@zero +} +;; + + +{ .mfi + ldfe asin_coeff_P9 = [ASIN_Addr1],16 + fma.s1 asin_t4 = asin_t2,asin_t2,f0 +(p9) mov GR_Parameter_Tag = 61 +} +{ .mfi + ldfe asin_coeff_P8 = [ASIN_Addr2],16 + fma.s1 asin_3by2 = asin_1by2,f1,f1 + nop.i 999;; +} + + +{ .mfi + ldfe asin_coeff_P2 = [ASIN_Addr2],16 + fma.s1 asin_tx4 = asin_tx2,asin_tx2,f0 + nop.i 999 +} +{ .mfb + ldfe asin_coeff_P3 = [ASIN_Addr1],16 + fma.s1 asin_t3 = asin_t,asin_t2,f0 +(p8) br.ret.spnt b0 +} +;; + + +{ .mfi + ldfe asin_coeff_P13 = [ASIN_Addr2],16 + fma.s1 asin_H0 = asin_y0,asin_1by2,f0 + nop.i 999 +} +{ .mfb + ldfe 
asin_coeff_P14 = [ASIN_Addr1],16 + fma.s1 asin_S0 = asin_y0,asin_t,f0 +(p9) br.cond.spnt __libm_error_region +} +;; + + +{ .mfi + ldfe asin_coeff_P11 = [ASIN_Addr2],16 + fcmp.eq.s1 p6,p0 = asin_abs_x,f1 + nop.i 999 +} +{ .mfi + ldfe asin_coeff_P12 = [ASIN_Addr1],16 + fma.s1 asin_tx3 = asin_tx,asin_tx2,f0 + nop.i 999;; +} + + +{ .mfi + ldfe asin_coeff_P10 = [ASIN_Addr2],16 + fma.s1 asin_1poly_p6 = asin_tx,asin_coeff_P7,asin_coeff_P6 + nop.i 999 +} +{ .mfi + ldfe asin_coeff_P1 = [ASIN_Addr1],16 + fma.s1 asin_poly_p6 = asin_t,asin_coeff_P7,asin_coeff_P6 + nop.i 999;; +} + + +{ .mfi + ldfe asin_const_sqrt2by2 = [ASIN_Addr2],16 + fma.s1 asin_5by2 = asin_3by2,f1,f1 + nop.i 999 +} +{ .mfi + ldfe asin_coeff_P21 = [ASIN_Addr1],16 + fma.s1 asin_11by4 = asin_3by2,asin_3by2,asin_1by2 + nop.i 999;; +} + + +{ .mfi + ldfe asin_const_piby2 = [ASIN_Addr1],16 + fma.s1 asin_poly_p17 = asin_t,asin_coeff_P18,asin_coeff_P17 + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 asin_3by4 = asin_3by2,asin_1by2,f0 +(p6) br.cond.spnt L(ASIN_ABS_1) // Branch to short exit if |x|=1 +} +;; + + +{ .mfi + addl ASIN_lnorm_sig = -0x1,r0 // Form significand 0xffffffffffffffff + fma.s1 asin_poly_p15 = asin_t,asin_coeff_P16,asin_coeff_P15 + nop.i 999 +} +{ .mfi + addl ASIN_snorm_exp = 0x0c001,r0 // Form small exponent + fnma.s1 asin_d = asin_S0,asin_H0,asin_1by2 + nop.i 999;; +} + + +// Form the exponent and significand of a small number +{ .mfi + setf.sig asin_eps_sig = ASIN_lnorm_sig + fma.s1 asin_poly_p19 = asin_t,asin_coeff_P20,asin_coeff_P19 + nop.i 999 +} +{ .mfi + setf.exp asin_eps_exp = ASIN_snorm_exp + fma.s1 asin_poly_p4 = asin_t,asin_coeff_P5,asin_coeff_P4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p17 = asin_tx,asin_coeff_P18,asin_coeff_P17 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_poly_p8 = asin_t,asin_coeff_P9,asin_coeff_P8 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fms.s1 asin_35by8 = asin_5by2,asin_11by4,asin_5by2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_63by8 = 
asin_5by2,asin_11by4,f1 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_poly_p13 = asin_t,asin_coeff_P14,asin_coeff_P13 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_18by4 = asin_3by2,asin_5by2,asin_3by4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_l1 = asin_5by2,asin_d,asin_3by2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_d2 = asin_d,asin_d,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_poly_p15 = asin_t2,asin_poly_p17,asin_poly_p15 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_T0 = asin_d,asin_S0,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_poly_p19 = asin_t2,asin_coeff_P21,asin_poly_p19 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_poly_p4 = asin_t2,asin_poly_p6,asin_poly_p4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_d1 = asin_35by8,asin_d,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_231by16 = asin_3by2,asin_35by8,asin_63by8 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_poly_p2 = asin_t,asin_coeff_P3,asin_coeff_P2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_poly_p8 = asin_t2,asin_coeff_P10,asin_poly_p8 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_poly_p11 = asin_t,asin_coeff_P12,asin_coeff_P11 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_e0 = asin_d2,asin_l1,asin_d + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p15 = asin_tx,asin_coeff_P16,asin_coeff_P15 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_poly_p0 = asin_t,asin_coeff_P1,f1 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p19 = asin_tx,asin_coeff_P20,asin_coeff_P19 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p4 = asin_tx,asin_coeff_P5,asin_coeff_P4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p8 = asin_tx,asin_coeff_P9,asin_coeff_P8 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_l2 = asin_231by16,asin_d,asin_63by8 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_d3 = asin_d2,asin_d,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_T3 = 
asin_d2,asin_T0,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_429by16 = asin_18by4,asin_11by4,asin_231by16 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_S1 = asin_e0,asin_S0,asin_S0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_poly_p4 = asin_t4,asin_poly_p8,asin_poly_p4 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_poly_p15 = asin_t4,asin_poly_p19,asin_poly_p15 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_poly_p0 = asin_t2,asin_poly_p2,asin_poly_p0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_poly_p11 = asin_t2,asin_poly_p13,asin_poly_p11 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_t8 = asin_t4,asin_t4,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_e1 = asin_d2,asin_l2,asin_d1 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p4 = asin_tx2,asin_1poly_p6,asin_1poly_p4 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p15 = asin_tx2,asin_1poly_p17,asin_1poly_p15 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p8 = asin_tx2,asin_coeff_P10,asin_1poly_p8 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p19 = asin_tx2,asin_coeff_P21,asin_1poly_p19 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p2 = asin_tx,asin_coeff_P3,asin_coeff_P2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p13 = asin_tx,asin_coeff_P14,asin_coeff_P13 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p0 = asin_tx,asin_coeff_P1,f1 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p11 = asin_tx,asin_coeff_P12,asin_coeff_P11 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_l3 = asin_429by16,asin_d,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_z = asin_e1,asin_T3,asin_S1 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_poly_p11 = asin_t4,asin_poly_p15,asin_poly_p11 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_T6 = asin_T3,asin_d3,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_t11 = asin_t8,asin_t3,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + 
fma.s1 asin_poly_p0 = asin_t4,asin_poly_p4,asin_poly_p0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p4 = asin_tx4,asin_1poly_p8,asin_1poly_p4 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p15 = asin_tx4,asin_1poly_p19,asin_1poly_p15 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p0 = asin_tx2,asin_1poly_p2,asin_1poly_p0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p11 = asin_tx2,asin_1poly_p13,asin_1poly_p11 + nop.i 999;; +} + + +{ .mfi + nop.m 999 +// fcmp.le.s1 asin_pred_LEsqrt2by2,asin_pred_GTsqrt2by2 = asin_abs_x,asin_const_sqrt2by2 + fcmp.le.s1 p7,p8 = asin_abs_x,asin_const_sqrt2by2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_tx8 = asin_tx4,asin_tx4,f0 + nop.i 999;; +} + + +// Form a small number to force inexact flag for small args +{ .mfi + nop.m 999 + fmerge.se asin_eps = asin_eps_exp,asin_eps_sig + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_z = asin_l3,asin_T6,asin_z + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 asin_series_t = asin_t11,asin_poly_p11,asin_poly_p0 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p0 = asin_tx4,asin_1poly_p4,asin_1poly_p0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asin_1poly_p11 = asin_tx4,asin_1poly_p15,asin_1poly_p11 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asin_tx11 = asin_tx8,asin_tx3,f0 + nop.i 999;; +} + +{ .mfi + nop.m 999 +//(asin_pred_GTsqrt2by2) fnma.s1 answer2 = asin_z,asin_series_t,asin_const_piby2 +(p8) fnma.s1 answer2 = asin_z,asin_series_t,asin_const_piby2 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 asin_series_tx = asin_tx11,asin_1poly_p11,asin_1poly_p0 + nop.i 999;; +} + +{ .mfi + nop.m 999 +//(asin_pred_GTsqrt2by2) fma.d f8 = asin_sgn_x,answer2,f0 +(p8) fma.d f8 = asin_sgn_x,answer2,f0 + nop.i 999;; +} + +// asin_eps is added only to force inexact and possibly underflow flag +// in case asin_series_tx is zero +// +{ .mfi + nop.m 999 +(p7) fma.d asin_eps = f8,asin_series_tx,asin_eps + nop.i 999 +} +{ .mfb + nop.m 999 
+//(asin_pred_LEsqrt2by2) fma.d f8 = f8,asin_series_tx,f0 +(p7) fma.d f8 = f8,asin_series_tx,f0 + br.ret.sptk b0 +} +;; + + +L(ASIN_ABS_1): +// Here for short exit if |x|=1 +{ .mfb + nop.m 999 + fma.d f8 = asin_sgn_x,asin_const_piby2,f0 + br.ret.sptk b0 +} +;; + + +.endp asin +ASM_SIZE_DIRECTIVE(asin) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 999 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body + frcpa.s0 f9,p0 = f0,f0 +;; + +{ .mib + stfd [GR_Parameter_X] = f8 // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfd [GR_Parameter_Y] = f9,-16 // Store Parameter 3 on stack + adds r32 = 48,sp + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + ldfd f8 = [r32] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return + +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support,@function +.global __libm_error_support diff --git a/sysdeps/ia64/fpu/e_asinf.S b/sysdeps/ia64/fpu/e_asinf.S new file mode 100644 index 0000000000..011dc9ec1b --- /dev/null +++ b/sysdeps/ia64/fpu/e_asinf.S @@ -0,0 +1,674 @@ +.file "asinf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. 
+// +// Contributed 2/02/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. + +// History +//============================================================== +// 2/02/00 Initial revision +// 6/28/00 Improved speed +// 6/31/00 Changed register allocation because of some duplicate macros +// moved nan exit bundle up to gain a cycle. +// 8/08/00 Improved speed by avoiding SIR flush. +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 8/17/00 Changed predicate register macro-usage to direct predicate +// names due to an assembler bug. +// 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal. + +// Description +//========================================= +// The asinf function computes the arc sine of x in the range [-pi/2,+pi/2]. +// A domain error occurs for arguments not in the range [-1,+1].
+// asinf(+-0) returns +-0 +// asinf(x) returns a NaN and raises the invalid exception for |x| >1 + +// The acosf function returns the arc cosine in the range [0, +pi] radians. +// A domain error occurs for arguments not in the range [-1,+1]. +// acosf(1) returns +0 +// acosf(x) returns a NaN and raises the invalid exception for |x| >1 + + +// |x| <= sqrt(2)/2. get Ax and Bx + +// poly_p1 = x p1 +// poly_p3 = x2 p4 + p3 +// poly_p1 = x2 (poly_p1) + x = x2(x p1) + x +// poly_p2 = x2( poly_p3) + p2 = x2(x2 p4 + p3) + p2 + +// poly_Ax = x5(x2( poly_p3) + p2) + x2(x p1) + x +// = x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x + +// poly_p7 = x2 p8 + p7 +// poly_p5 = x2 p6 + p5 + +// poly_p7 = x4 p9 + (poly_p7) +// poly_p7 = x4 p9 + (x2 p8 + p7) +// poly_Bx = x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5 + +// answer1 = x11(x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5) + x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x +// = x19 p9 + x17 p8 + x15 p7 + x13 p6 + x11 p5 + x9 p4 + x7 p3 + x5 p2 + x3 p1 + x + + + +// |x| > sqrt(2)/2 + +// Get z = sqrt(1-x2) + +// Get polynomial in t = 1-x2 + +// t2 = t t +// t4 = t2 t2 + +// poly_p4 = t p5 + p4 +// poly_p1 = t p1 + 1 + +// poly_p6 = t p7 + p6 +// poly_p2 = t p3 + p2 + +// poly_p8 = t p9 + p8 + +// poly_p4 = t2 poly_p6 + poly_p4 +// = t2 (t p7 + p6) + (t p5 + p4) + +// poly_p2 = t2 poly_p2 + poly_p1 +// = t2 (t p3 + p2) + (t p1 + 1) + +// poly_p4 = t4 poly_p8 + poly_p4 +// = t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4)) + +// P(t) = poly_p2 + t4 poly_p4 +// = t2 (t p3 + p2) + (t p1 + 1) + t4 (t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4))) +// = t3 p3 + t2 p2 + t p1 + 1 + t9 p9 + t8 p8 + t7 p7 + t6 p6 + t5 p5 + t4 p4 + + +// answer2 = - sign(x) z P(t) + (sign(x) pi/2) +// + +#include "libm_support.h" + +// Assembly macros +//========================================= + +// predicate registers +//asinf_pred_LEsqrt2by2 = p7 +//asinf_pred_GTsqrt2by2 = p8 + +// integer registers +ASINF_Addr1 = r33 +ASINF_Addr2 = r34 +ASINF_GR_1by2 = r35 + +ASINF_GR_3by2 = r36
+ASINF_GR_5by2 = r37 + +GR_SAVE_B0 = r38 +GR_SAVE_PFS = r39 +GR_SAVE_GP = r40 + +GR_Parameter_X = r41 +GR_Parameter_Y = r42 +GR_Parameter_RESULT = r43 +GR_Parameter_TAG = r44 + +// floating point registers + +asinf_y = f32 +asinf_abs_x = f33 +asinf_x2 = f34 +asinf_sgn_x = f35 + +asinf_1by2 = f36 +asinf_3by2 = f37 +asinf_5by2 = f38 +asinf_coeff_P3 = f39 +asinf_coeff_P8 = f40 + +asinf_coeff_P1 = f41 +asinf_coeff_P4 = f42 +asinf_coeff_P5 = f43 +asinf_coeff_P2 = f44 +asinf_coeff_P7 = f45 + +asinf_coeff_P6 = f46 +asinf_coeff_P9 = f47 +asinf_x2 = f48 +asinf_x3 = f49 +asinf_x4 = f50 + +asinf_x8 = f51 +asinf_x5 = f52 +asinf_const_piby2 = f53 +asinf_const_sqrt2by2 = f54 +asinf_x11 = f55 + +asinf_poly_p1 = f56 +asinf_poly_p3 = f57 +asinf_sinf1 = f58 +asinf_poly_p2 = f59 +asinf_poly_Ax = f60 + +asinf_poly_p7 = f61 +asinf_poly_p5 = f62 +asinf_sgnx_t4 = f63 +asinf_poly_Bx = f64 +asinf_t = f65 + +asinf_yby2 = f66 +asinf_B = f67 +asinf_B2 = f68 +asinf_Az = f69 +asinf_dz = f70 + +asinf_Sz = f71 +asinf_d2z = f72 +asinf_Fz = f73 +asinf_z = f74 +asinf_sgnx_z = f75 + +asinf_t2 = f76 +asinf_2poly_p4 = f77 +asinf_2poly_p6 = f78 +asinf_2poly_p1 = f79 +asinf_2poly_p2 = f80 + +asinf_2poly_p8 = f81 +asinf_t4 = f82 +asinf_Pt = f83 +asinf_sgnx_2poly_p2 = f84 +asinf_sgn_x_piby2 = f85 + +asinf_poly_p7a = f86 +asinf_2poly_p4a = f87 +asinf_2poly_p4b = f88 +asinf_2poly_p2a = f89 +asinf_poly_p1a = f90 + + + + + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +asinf_coeff_1_table: +ASM_TYPE_DIRECTIVE(asinf_coeff_1_table,@object) +data8 0x3FC5555607DCF816 // P1 +data8 0x3F9CF81AD9BAB2C6 // P4 +data8 0x3FC59E0975074DF3 // P7 +data8 0xBFA6F4CC2780AA1D // P6 +data8 0x3FC2DD45292E93CB // P9 +data8 0x3fe6a09e667f3bcd // sqrt(2)/2 +ASM_SIZE_DIRECTIVE(asinf_coeff_1_table) + +asinf_coeff_2_table: +ASM_TYPE_DIRECTIVE(asinf_coeff_2_table,@object) +data8 0x3FA6F108E31EFBA6 // P3 +data8 0xBFCA31BF175D82A0 // P8 +data8 
0x3FA30C0337F6418B // P5 +data8 0x3FB332C9266CB1F9 // P2 +data8 0x3ff921fb54442d18 // pi_by_2 +ASM_SIZE_DIRECTIVE(asinf_coeff_2_table) + + +.align 32 +.global asinf + +.section .text +.proc asinf +.align 32 + +asinf: + +// Load the addresses of the two tables. +// Then, load the coefficients and other constants. + +{ .mfi + alloc r32 = ar.pfs,1,8,4,0 + fnma.s1 asinf_t = f8,f8,f1 + dep.z ASINF_GR_1by2 = 0x3f,24,8 // 0x3f000000 +} +{ .mfi + addl ASINF_Addr1 = @ltoff(asinf_coeff_1_table),gp + fma.s1 asinf_x2 = f8,f8,f0 + addl ASINF_Addr2 = @ltoff(asinf_coeff_2_table),gp ;; +} + + +{ .mfi + ld8 ASINF_Addr1 = [ASINF_Addr1] + fmerge.s asinf_abs_x = f1,f8 + dep ASINF_GR_3by2 = -1,r0,22,8 // 0x3fc00000 +} +{ .mlx + nop.m 999 + movl ASINF_GR_5by2 = 0x40200000;; +} + + + +{ .mfi + setf.s asinf_1by2 = ASINF_GR_1by2 + fmerge.s asinf_sgn_x = f8,f1 + nop.i 999 +} +{ .mfi + ld8 ASINF_Addr2 = [ASINF_Addr2] + nop.f 0 + nop.i 999;; +} + + +{ .mfi + setf.s asinf_5by2 = ASINF_GR_5by2 + fcmp.lt.s1 p11,p12 = f8,f0 + nop.i 999;; +} + +{ .mmf + ldfpd asinf_coeff_P1,asinf_coeff_P4 = [ASINF_Addr1],16 + setf.s asinf_3by2 = ASINF_GR_3by2 + fclass.m.unc p8,p0 = f8, 0xc3 ;; //@qnan | @snan +} + + +{ .mfi + ldfpd asinf_coeff_P7,asinf_coeff_P6 = [ASINF_Addr1],16 + fma.s1 asinf_t2 = asinf_t,asinf_t,f0 + nop.i 999 +} +{ .mfi + ldfpd asinf_coeff_P3,asinf_coeff_P8 = [ASINF_Addr2],16 + fma.s1 asinf_x4 = asinf_x2,asinf_x2,f0 + nop.i 999;; +} + + +{ .mfi + ldfpd asinf_coeff_P9,asinf_const_sqrt2by2 = [ASINF_Addr1] + fclass.m.unc p10,p0 = f8, 0x07 //@zero + nop.i 999 +} +{ .mfi + ldfpd asinf_coeff_P5,asinf_coeff_P2 = [ASINF_Addr2],16 + fma.s1 asinf_x3 = f8,asinf_x2,f0 + nop.i 999;; +} + + +{ .mfi + ldfd asinf_const_piby2 = [ASINF_Addr2] + frsqrta.s1 asinf_B,p0 = asinf_t + nop.i 999 +} +{ .mfb + nop.m 999 +(p8) fma.s f8 = f8,f1,f0 +(p8) br.ret.spnt b0 ;; // Exit if x=nan +} + + +{ .mfb + nop.m 999 + fcmp.eq.s1 p6,p0 = asinf_abs_x,f1 +(p10) br.ret.spnt b0 ;; // Exit if x=0 +} + +{ .mfi + nop.m 999 + 
fcmp.gt.s1 p9,p0 = asinf_abs_x,f1 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 asinf_x8 = asinf_x4,asinf_x4,f0 + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 asinf_t4 = asinf_t2,asinf_t2,f0 +(p6) br.cond.spnt L(ASINF_ABS_ONE) ;; // Branch if |x|=1 +} + +{ .mfi + nop.m 999 + fma.s1 asinf_x5 = asinf_x2,asinf_x3,f0 + nop.i 999 +} +{ .mfb +(p9) mov GR_Parameter_TAG = 62 + fma.s1 asinf_yby2 = asinf_t,asinf_1by2,f0 +(p9) br.cond.spnt __libm_error_region ;; // Branch if |x|>1 +} + + +{ .mfi + nop.m 999 + fma.s1 asinf_Az = asinf_t,asinf_B,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asinf_B2 = asinf_B,asinf_B,f0 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 asinf_poly_p1 = f8,asinf_coeff_P1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asinf_2poly_p1 = asinf_coeff_P1,asinf_t,f1 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 asinf_poly_p3 = asinf_coeff_P4,asinf_x2,asinf_coeff_P3 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asinf_2poly_p6 = asinf_coeff_P7,asinf_t,asinf_coeff_P6 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 asinf_poly_p7 = asinf_x2,asinf_coeff_P8,asinf_coeff_P7 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asinf_2poly_p2 = asinf_coeff_P3,asinf_t,asinf_coeff_P2 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asinf_poly_p5 = asinf_x2,asinf_coeff_P6,asinf_coeff_P5 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asinf_2poly_p4 = asinf_coeff_P5,asinf_t,asinf_coeff_P4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.d.s1 asinf_x11 = asinf_x8,asinf_x3,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fnma.s1 asinf_dz = asinf_B2,asinf_yby2,asinf_1by2 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asinf_poly_p1a = asinf_x2,asinf_poly_p1,f8 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asinf_2poly_p8 = asinf_coeff_P9,asinf_t,asinf_coeff_P8 + nop.i 999;; +} + + +// Get the absolute value of x and determine the region in which x lies + +{ .mfi + nop.m 999 + fcmp.le.s1 p7,p8 = asinf_abs_x,asinf_const_sqrt2by2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asinf_poly_p2 = 
asinf_x2,asinf_poly_p3,asinf_coeff_P2 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 asinf_poly_p7a = asinf_x4,asinf_coeff_P9,asinf_poly_p7 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 asinf_2poly_p2a = asinf_2poly_p2,asinf_t2,asinf_2poly_p1 + nop.i 999;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 asinf_sgnx_t4 = asinf_sgn_x,asinf_t4,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 asinf_2poly_p4a = asinf_2poly_p6,asinf_t2,asinf_2poly_p4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 asinf_Sz = asinf_5by2,asinf_dz,asinf_3by2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 asinf_d2z = asinf_dz,asinf_dz,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 asinf_sgn_x_piby2 = asinf_sgn_x,asinf_const_piby2,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.d.s1 asinf_poly_Ax = asinf_x5,asinf_poly_p2,asinf_poly_p1a + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p7) fma.d.s1 asinf_poly_Bx = asinf_x4,asinf_poly_p7a,asinf_poly_p5 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 asinf_sgnx_2poly_p2 = asinf_sgn_x,asinf_2poly_p2a,f0 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fcmp.eq.s0 p6,p0 = f8,f0 // Only purpose is to set D if x denormal + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 asinf_2poly_p4b = asinf_2poly_p8,asinf_t4,asinf_2poly_p4a + nop.i 999;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 asinf_Fz = asinf_d2z,asinf_Sz,asinf_dz + nop.i 999;; +} + + +{ .mfi + nop.m 999 +(p8) fma.d.s1 asinf_Pt = asinf_2poly_p4b,asinf_sgnx_t4,asinf_sgnx_2poly_p2 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p8) fma.d.s1 asinf_z = asinf_Az,asinf_Fz,asinf_Az + nop.i 999;; +} + +.pred.rel "mutex",p8,p7 //asinf_pred_GTsqrt2by2,asinf_pred_LEsqrt2by2 +{ .mfi + nop.m 999 +(p8) fnma.s f8 = asinf_z,asinf_Pt,asinf_sgn_x_piby2 + nop.i 999 +} + +{ .mfb + nop.m 999 +(p7) fma.s f8 = asinf_x11,asinf_poly_Bx,asinf_poly_Ax + br.ret.sptk b0 ;; +} + +L(ASINF_ABS_ONE): +// Here for short exit if |x|=1 +{ .mfb + nop.m 999 + fma.s f8 = asinf_sgn_x,asinf_const_piby2,f0 + br.ret.sptk b0 +} +;; + +.endp asinf 
+ASM_SIZE_DIRECTIVE(asinf) + +// Stack operations when calling error support. +// (1) (2) +// sp -> + psp -> + +// | | +// | | <- GR_Y +// | | +// | <-GR_Y Y2->| +// | | +// | | <- GR_X +// | | +// sp-64 -> + sp -> + +// save ar.pfs save b0 +// save gp + + +// Stack operations when calling error support. +// (3) (call) (4) +// psp -> + sp -> + +// | | +// R3 ->| <- GR_RESULT | -> f8 +// | | +// Y2 ->| <- GR_Y | +// | | +// X1 ->| | +// | | +// sp -> + + +// restore gp +// restore ar.pfs + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 999 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +{ .mfi + nop.m 0 + frcpa.s0 f9,p0 = f0,f0 + nop.i 0 +};; + +{ .mib + stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_asinl.S b/sysdeps/ia64/fpu/e_asinl.S new file mode 100644 index 
0000000000..32bf4af0e1 --- /dev/null +++ b/sysdeps/ia64/fpu/e_asinl.S @@ -0,0 +1,777 @@ +.file "asinl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. 
+// +// API +//============================================================== +// long double = asinl(long double) +// input floating point f8 +// output floating point f8 +// +// Registers used +//============================================================== +// +// predicate registers used: +// p6 -> p12 +// +// floating-point registers used: +// f8 has input, then output +// f32 -> f87, f8 -> f13, f32 -> f87 +// +// general registers used: +// r32 -> r47 +// +// Overview of operation +//============================================================== +// There are three paths +// 1. |x| < 2^-40 ASIN_TINY +// 2. 2^-40 <= |x| < 1/4 ASIN_POLY +// 3. 1/4 <= |x| < 1 ASIN_ATAN + +#include "libm_support.h" + +// Assembly macros +//============================================================== +FR_RESULT = f10 +FR_X = f8 +FR_Y = f1 +asin_P79 = f32 +asin_P59 = f33 +asin_P39 = f34 +asin_P19 = f35 + +asin_P810 = f36 +asin_P610 = f37 +asin_P410 = f38 +asin_P210 = f39 + +asin_A1 = f41 +asin_A2 = f42 +asin_A3 = f43 +asin_A4 = f44 +asin_A5 = f45 +asin_A6 = f46 +asin_A7 = f47 +asin_A8 = f48 +asin_A9 = f49 +asin_A10 = f50 + +asin_X2 = f51 +asin_X4 = f52 + +asin_B = f53 +asin_Bb = f54 +asin_C = f55 +asin_Cc = f56 +asin_D = f57 + +asin_W = f58 +asin_Ww = f59 + +asin_y0 = f60 +asin_y1 = f61 +asin_y2 = f62 + +asin_H = f63 +asin_Hh = f64 + +asin_t1 = f65 +asin_t2 = f66 +asin_t3 = f67 +asin_t4 = f68 +asin_t5 = f69 + +asin_Pseries = f70 +asin_NORM_f8 = f71 +asin_ABS_NORM_f8 = f72 + +asin_2m100 = f73 +asin_P1P2 = f74 +asin_HALF = f75 +asin_1mD = f76 + +asin_1mB = f77 +asin_1mBmC = f78 +asin_S = f79 + +asin_BmWW = f80 +asin_BmWWpb = f81 +asin_2W = f82 +asin_1d2W = f83 +asin_Dd = f84 + +asin_XWw = f85 +asin_low = f86 + +asin_pi_by_2 = f87 +asin_pi_by_2_lo = f88 + +asin_GR_17_ones = r33 +asin_GR_16_ones = r34 +asin_GR_signexp_f8 = r35 +asin_GR_exp = r36 +asin_GR_true_exp = r37 +asin_GR_ff9b = r38 + +GR_SAVE_B0 = r39 +GR_SAVE_SP = r40 +GR_SAVE_PFS = r33 +// r33 can be used safely. 
+// r40 is address of table of coefficients +// Later it is used to save sp across calls +GR_SAVE_GP = r41 +asin_GR_fffe = r42 +asin_GR_retval = r43 + +GR_Parameter_X = r44 +GR_Parameter_Y = r45 +GR_Parameter_RESULT = r46 +GR_Parameter_TAG = r47 + + +// 2^-40: +// A true exponent of -40 is +// : -40 + register_bias +// : -28 + ffff = ffd7 + +// A true exponent of -100 is +// : -100 + register_bias +// : -64 + ffff = ff9b + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +asin_coefficients: +ASM_TYPE_DIRECTIVE(asin_coefficients,@object) +data8 0xBB08911F2013961E, 0x00003FF8 // A10 +data8 0x981F1095A23A87D3, 0x00003FF8 // A9 +data8 0xBDF09C6C4177BCC6, 0x00003FF8 // A8 +data8 0xE4C3A60B049ACCEA, 0x00003FF8 // A7 +data8 0x8E2789F4E8A8F1AD, 0x00003FF9 // A6 +data8 0xB745D09B2B0E850B, 0x00003FF9 // A5 +data8 0xF8E38E3BC4C50920, 0x00003FF9 // A4 +data8 0xB6DB6DB6D89FCD81, 0x00003FFA // A3 +data8 0x99999999999AF376, 0x00003FFB // A2 +data8 0xAAAAAAAAAAAAAA71, 0x00003FFC // A1 + +data8 0xc90fdaa22168c234, 0x00003FFF // pi_by_2_hi +data8 0xc4c6628b80dc1cd1, 0x00003FBF // pi_by_2_lo +ASM_SIZE_DIRECTIVE(asin_coefficients) + +.align 32 +.global asinl# + +.section .text +.proc asinl# +.align 32 + + +asinl: + +{ .mfi + alloc r32 = ar.pfs,1,11,4,0 +(p0) fnorm asin_NORM_f8 = f8 +(p0) mov asin_GR_17_ones = 0x1ffff +} + +{ .mii +(p0) mov asin_GR_16_ones = 0xffff +(p0) mov asin_GR_ff9b = 0xff9b ;; + nop.i 999 +} + + +{ .mmi +(p0) setf.exp asin_2m100 = asin_GR_ff9b +(p0) addl r40 = @ltoff(asin_coefficients), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r40 = [r40] + nop.m 999 + nop.i 999 +} +;; + + + +// Load the constants + +{ .mmi +(p0) ldfe asin_A10 = [r40],16 ;; +(p0) ldfe asin_A9 = [r40],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe asin_A8 = [r40],16 ;; +(p0) ldfe asin_A7 = [r40],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe asin_A6 = [r40],16 ;; +(p0) getf.exp asin_GR_signexp_f8 = asin_NORM_f8 + 
nop.i 999 +} + +{ .mmi +(p0) ldfe asin_A5 = [r40],16 ;; +(p0) ldfe asin_A4 = [r40],16 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s asin_ABS_NORM_f8 = f0, asin_NORM_f8 +(p0) and asin_GR_exp = asin_GR_signexp_f8, asin_GR_17_ones ;; +} + +// case 1: |x| < 2^-40 ==> p6 (includes x = +-0) +// case 2: 2^-40 <= |x| < 2^-2 ==> p8 +// case 3: 2^-2 <= |x| < 1 ==> p9 +// case 4: 1 <= |x| ==> p11 +// In case 4, we pick up the special case x = +-1 and return +-pi/2 + +{ .mii +(p0) ldfe asin_A3 = [r40],16 +(p0) sub asin_GR_true_exp = asin_GR_exp, asin_GR_16_ones ;; +(p0) cmp.ge.unc p6, p7 = -41, asin_GR_true_exp ;; +} + +{ .mii +(p0) ldfe asin_A2 = [r40],16 +(p7) cmp.ge.unc p8, p9 = -3, asin_GR_true_exp ;; +(p9) cmp.ge.unc p10, p11 = -1, asin_GR_true_exp +} + +{ .mmi +(p0) ldfe asin_A1 = [r40],16 ;; +(p0) ldfe asin_pi_by_2 = [r40],16 + nop.i 999 +} + +// case 4: |x| >= 1 +{ .mib + nop.m 999 + nop.i 999 +(p11) br.spnt L(ASIN_ERROR_RETURN) ;; +} + +// case 1: |x| < 2^-40 +{ .mfb + nop.m 999 +(p6) fma.s0 f8 = asin_2m100,f8,f8 +(p6) br.ret.spnt b0 ;; +} + + +// case 2: 2^-40 <= |x| < 2^-2 ==> p8 +{ .mfi + nop.m 999 +(p8) fma.s1 asin_X2 = f8,f8, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 asin_X4 = asin_X2,asin_X2, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 asin_P810 = asin_X4, asin_A10, asin_A8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p8) fma.s1 asin_P79 = asin_X4, asin_A9, asin_A7 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 asin_P610 = asin_X4, asin_P810, asin_A6 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p8) fma.s1 asin_P59 = asin_X4, asin_P79, asin_A5 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 asin_P410 = asin_X4, asin_P610, asin_A4 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p8) fma.s1 asin_P39 = asin_X4, asin_P59, asin_A3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 asin_P210 = asin_X4, asin_P410, asin_A2 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p8) fma.s1 asin_P19 = asin_X4, asin_P39, asin_A1 + nop.i 999 ;; +} + +{ .mfi + nop.m 
999 +(p8) fma.s1 asin_P1P2 = asin_X2, asin_P210, asin_P19 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 asin_P1P2 = asin_X2, asin_P1P2, f0 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p8) fma.s0 f8 = asin_NORM_f8, asin_P1P2, asin_NORM_f8 +(p8) br.ret.spnt b0 ;; +} + +// case 3: 2^-2 <= |x| < 1 +// 1- X*X is computed as B + b +// Step 1.1: Get B and b + +// atan2 will return +// f8 = Z_hi +// f10 = Z_lo +// f11 = s_lo + + +{ .mfi +(p0) mov asin_GR_fffe = 0xfffe +(p0) fmerge.se f8 = asin_ABS_NORM_f8, asin_ABS_NORM_f8 +nop.i 0 +};; + +{ .mmf +nop.m 0 +(p0) setf.exp asin_HALF = asin_GR_fffe +(p0) fmerge.se f12 = asin_NORM_f8, asin_NORM_f8 ;; +} + + +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p6,p7 = asin_ABS_NORM_f8, asin_HALF + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.s1 asin_D = f1,f1,asin_ABS_NORM_f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fms.s1 asin_C = f1,f1,asin_ABS_NORM_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.s1 asin_B = asin_C, asin_D, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fms.s1 asin_1mD = f1,f1,asin_D + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.s1 asin_Dd = asin_1mD,f1, asin_ABS_NORM_f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fms.s1 asin_Bb = asin_C, asin_D, asin_B + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.s1 asin_Bb = asin_C, asin_Dd, asin_Bb + nop.i 999 +} + +{ .mfi + nop.m 999 +(p6) fma.s1 asin_C = asin_ABS_NORM_f8, asin_ABS_NORM_f8, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p6) fms.s1 asin_B = f1, f1, asin_C + nop.i 999 +} + +{ .mfi + nop.m 999 +(p6) fms.s1 asin_Cc = asin_ABS_NORM_f8, asin_ABS_NORM_f8, asin_C + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_Hh = asin_HALF, asin_B, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p6) fms.s1 asin_1mB = f1, f1, asin_B + nop.i 999 ;; +} + +// Step 1.2: +// sqrt(B + b) is computed as W + w +// Get W + +{ .mfi + nop.m 999 +(p0) frsqrta.s1 asin_y0,p8 = asin_B + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p6) fms.s1 asin_1mBmC = asin_1mB, f1, asin_C + nop.i 999 
;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_t1 = asin_y0, asin_y0, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p6) fms.s1 asin_Bb = asin_1mBmC, f1, asin_Cc + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 asin_t2 = asin_t1, asin_Hh, asin_HALF + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_y1 = asin_t2, asin_y0, asin_y0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_t3 = asin_y1, asin_Hh, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 asin_t4 = asin_t3, asin_y1, asin_HALF + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_y2 = asin_t4, asin_y1, asin_y1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_S = asin_B, asin_y2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_H = asin_y2, asin_HALF, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_t5 = asin_Hh, asin_y2, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 asin_Dd = asin_S, asin_S, asin_B + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_W = asin_Dd, asin_H, asin_S + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_2W = asin_W, f1, asin_W + nop.i 999 +} + +// Step 1.3 +// Get w +{ .mfi + nop.m 999 +(p0) fnma.s1 asin_BmWW = asin_W, asin_W, asin_B + nop.i 999 ;; +} + +// Step 2 +// asin(x) = atan2(X,sqrt(1-X*X)) +// = atan2(X, W) -Xw +// corr = Xw +// asin(x) = Z_hi + (s_lo*Z_lo - corr) +// Call atan2(X, W) +// Save W in f9 +// Save X in f12 +// Save w in f13 + +{ .mfi + nop.m 999 +(p0) fmerge.se f9 = asin_W, asin_W + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_BmWWpb = asin_BmWW, f1, asin_Bb + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) frcpa.s1 asin_1d2W,p9 = f1, asin_2W + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 asin_Ww = asin_BmWWpb, asin_1d2W, f0 + nop.i 999 ;; +} +.endp asinl +ASM_SIZE_DIRECTIVE(asinl) + +.proc __libm_callout +__libm_callout: +.prologue +{ .mfi + nop.m 0 + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +};; +{ .mfi + mov GR_SAVE_GP=gp // Save 
gp + nop.f 0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +} +.body +{.mfb + nop.m 0 +(p0) fmerge.se f13 = asin_Ww, asin_Ww +(p0) br.call.sptk.many b0=__libm_atan2_reg# +};; +{ .mfi + mov gp = GR_SAVE_GP // Restore gp +(p0) fma.s1 asin_XWw = asin_ABS_NORM_f8,f13,f0 + mov b0 = GR_SAVE_B0 // Restore return address +};; +// asin_XWw = Xw = corr +// asin_low = (s_lo * Z_lo - corr) +// f8 = Z_hi + (s_lo * Z_lo - corr) + +{ .mfi + nop.m 999 +(p0) fms.s1 asin_low = f11, f10, asin_XWw + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs +};; + +{ .mfi + nop.m 999 +(p0) fma.s0 f8 = f8, f1, asin_low + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.s f8 = f12,f8 +(p0) br.ret.sptk b0 ;; +} +.endp __libm_callout +ASM_SIZE_DIRECTIVE(__libm_callout) + +.proc SPECIAL +SPECIAL: +L(ASIN_ERROR_RETURN): + +// If X is 1, return (sign of X)pi/2 + +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc p6,p7 = asin_ABS_NORM_f8,f1 + nop.i 999 ;; +} + +{ .mfb +(p6) ldfe asin_pi_by_2_lo = [r40] +(p6) fmerge.s asin_pi_by_2 = f8,asin_pi_by_2 + nop.b 0;; +} + +// If X is a NAN, leave +// qnan snan inf norm unorm 0 -+ +// 1 1 0 0 0 0 11 +{ .mfb + nop.m 999 +(p6) fma.s0 f8 = f8,asin_pi_by_2_lo,asin_pi_by_2 +(p6) br.ret.spnt b0 +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p12,p0 = f8, 0xc3 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p12) fma.s0 f8 = f8,f1,f0 +(p12) br.ret.spnt b0 ;; +} +{ .mfi +(p0) mov GR_Parameter_TAG = 60 +(p0) frcpa f10, p6 = f0, f0 +nop.i 0 +};; +.endp SPECIAL +ASM_SIZE_DIRECTIVE(SPECIAL) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe 
[GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# + +.type __libm_atan2_reg#,@function +.global __libm_atan2_reg# diff --git a/sysdeps/ia64/fpu/e_atan2.S b/sysdeps/ia64/fpu/e_atan2.S new file mode 100644 index 0000000000..6d6b11be8c --- /dev/null +++ b/sysdeps/ia64/fpu/e_atan2.S @@ -0,0 +1,1124 @@ +.file "atan2.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 8/17/00 Changed predicate register macro-usage to direct predicate +// names due to an assembler bug. +// 9/28/00 Updated to set invalid on SNaN inputs +// 1/19/01 Fixed flags for small results +// +// API +//============================================================== +// double atan2(double Y, double X) +// +// Overview of operation +//============================================================== +// +// There are two basic paths: swap true and swap false. +// atan2(Y,X) ==> atan2(V/U) where U >= V. If Y > X, we must swap. 
+// +// p6 swap True |Y| > |X| +// p7 swap False |Y| <= |X| +// p8 X+ (If swap=True p8=p9=0) +// p9 X- +// +// all the other predicates p10 thru p15 are false for the main path +// +// Simple trigonometric identities show +// Region 1 (-45 to +45 degrees): +// X>0, |Y|<=X, V=Y, U=X atan2(Y,X) = sgnY * (0 + atan(V/U)) +// +// Region 2 (-90 to -45 degrees, and +45 to +90 degrees): +// X>0, |Y|>X, V=X, U=Y atan2(Y,X) = sgnY * (pi/2 - atan(V/U)) +// +// Region 3 (-135 to -90 degrees, and +90 to +135 degrees): +// X<0, |Y|>|X|, V=X, U=Y atan2(Y,X) = sgnY * (pi/2 + atan(V/U)) +// +// Region 4 (-180 to -135 degrees, and +135 to +180 degrees): +// X<0, |Y|<=|X|, V=Y, U=X atan2(Y,X) = sgnY * (pi - atan(V/U)) +// +// So the result is always of the form atan2(Y,X) = P + sgnXY * atan(V/U) +// +// We compute atan(V/U) from the identity +// atan(z) + atan([(V/U)-z] / [1+(V/U)z]) +// where z is a limited precision approximation (16 bits) to V/U +// +// z is calculated with the assistance of the frcpa instruction. +// +// atan(z) is calculated by a polynomial z + z^3 * p(w), w=z^2 +// where p(w) = P0+P1*w+...+P22*w^22 +// +// Let d = [(V/U)-z] / [1+(V/U)z] = (V-U*z)/(U+V*z) +// +// Approximate atan(d) by d + P0*d^3 +// Let F = 1/(U+V*z) * (1-a), where |a|< 2^-8.8. +// Compute q(a) = 1 + a + ... + a^5. +// Then F*q(a) approximates the reciprocal to more than 50 bits. 
+ +// Special values +//============================================================== +// Y x Result +// +number +inf +0 +// -number +inf -0 +// +number -inf +pi +// -number -inf -pi +// +// +inf +number +pi/2 +// -inf +number -pi/2 +// +inf -number +pi/2 +// -inf -number -pi/2 +// +// +inf +inf +pi/4 +// -inf +inf -pi/4 +// +inf -inf +3pi/4 +// -inf -inf -3pi/4 +// +// +1 +1 +pi/4 +// -1 +1 -pi/4 +// +1 -1 +3pi/4 +// -1 -1 -3pi/4 +// +// +number +0 +pi/2 +// -number +0 -pi/2 +// +number -0 +pi/2 +// -number -0 -pi/2 +// +// +0 +number +0 +// -0 +number -0 +// +0 -number +pi +// -0 -number -pi +// +// +0 +0 +0 +// -0 +0 -0 +// +0 -0 +pi +// -0 -0 -pi +// +// Nan anything quiet Y +// anything NaN quiet X + +// atan2(+-0/+-0) sets double error tag to 37 +// atan2(+-0/+-0) sets single error tag to 38 + +#include "libm_support.h" + +// Assembly macros +//============================================================== + +EXP_AD_P1 = r33 +EXP_AD_P2 = r34 +atan2_GR_sml_exp = r35 + + +GR_SAVE_B0 = r35 +GR_SAVE_GP = r36 +GR_SAVE_PFS = r37 + +GR_Parameter_X = r38 +GR_Parameter_Y = r39 +GR_Parameter_RESULT = r40 +atan2_GR_tag = r41 + + +atan2_X = f9 +atan2_Y = f8 + +atan2_u1_X = f32 +atan2_u1_Y = f33 +atan2_Umax = f34 +atan2_Vmin = f35 +atan2_two = f36 +atan2_absX = f37 +atan2_z1_X = f38 +atan2_z1_Y = f39 +atan2_B1X = f40 +atan2_B1Y = f41 +atan2_wp = f42 +atan2_B1sq = f43 +atan2_z = f44 +atan2_w = f45 + +atan2_P0 = f46 +atan2_P1 = f47 +atan2_P2 = f48 +atan2_P3 = f49 +atan2_P4 = f50 +atan2_P5 = f51 +atan2_P6 = f52 +atan2_P7 = f53 +atan2_P8 = f54 +atan2_P9 = f55 +atan2_P10 = f56 +atan2_P11 = f57 +atan2_P12 = f58 +atan2_P13 = f59 +atan2_P14 = f60 +atan2_P15 = f61 +atan2_P16 = f62 +atan2_P17 = f63 +atan2_P18 = f64 +atan2_P19 = f65 +atan2_P20 = f66 +atan2_P21 = f67 +atan2_P22 = f68 +atan2_Pi_by_2 = f69 + +atan2_V13 = f70 +atan2_W11 = f71 +atan2_E = f72 +atan2_gamma = f73 +atan2_V11 = f74 +atan2_V12 = f75 +atan2_V7 = f76 +atan2_V8 = f77 +atan2_W7 = f78 +atan2_W8 = f79 +atan2_W3 = 
f80 +atan2_W4 = f81 +atan2_V3 = f82 +atan2_V4 = f83 +atan2_F = f84 +atan2_gV = f85 +atan2_V10 = f86 +atan2_zcub = f87 +atan2_V6 = f88 +atan2_V9 = f89 +atan2_W10 = f90 +atan2_W6 = f91 +atan2_W2 = f92 +atan2_V2 = f93 + +atan2_alpha = f94 +atan2_alpha_1 = f95 +atan2_gVF = f96 +atan2_V5 = f97 +atan2_W12 = f98 +atan2_W5 = f99 +atan2_alpha_sq = f100 +atan2_Cp = f101 +atan2_V1 = f102 + +atan2_sml_norm = f103 +atan2_FR_tmp = f103 + +atan2_W1 = f104 +atan2_alpha_cub = f105 +atan2_C = f106 +atan2_P = f107 +atan2_d = f108 +atan2_A_hi = f109 +atan2_dsq = f110 +atan2_pd = f111 +atan2_A_lo = f112 +atan2_A = f113 + +atan2_Pp = f114 + +atan2_sgnY = f116 +atan2_pi = f117 +atan2_sgnX = f118 +atan2_sgnXY = f119 + +atan2_3pi_by_4 = f120 +atan2_pi_by_4 = f121 + +//atan2_sF = p7 +//atan2_sT = p6 + +// These coefficients are for atan2. +// You can also use this set to substitute those used in the |X| <= 1 case for atan; +// BUT NOT vice versa. + +///////////////////////////////////////////////////////////// + + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +atan2_tb1: +ASM_TYPE_DIRECTIVE(atan2_tb1,@object) +data8 0xB199DD6D2675C40F , 0x0000BFFA // P10 +data8 0xA21922DC45605EA1 , 0x00003FFA // P11 +data8 0xD78F28FC2A592781 , 0x0000BFFA // P8 +data8 0xC2F01E5DDD100DBE , 0x00003FFA // P9 +data8 0x9D89D7D55C3287A5 , 0x00003FFB // P5 +data8 0xF0F03ADB3FC930D3 , 0x00003FFA // P7 +data8 0xF396268151CFB11C , 0x00003FF7 // P17 +data8 0x9D3436AABE218776 , 0x00003FF5 // P19 +data8 0x80D601879218B53A , 0x00003FFA // P13 +data8 0xA2270D30A90AA220 , 0x00003FF9 // P15 +data8 0xCCCCCCCCCCC906CD , 0x00003FFC // P1 +data8 0xE38E38E320A8A098 , 0x00003FFB // P3 +data8 0xFE7E52D2A89995B3 , 0x0000BFEC // P22 +data8 0xC90FDAA22168C235 , 0x00003FFE // pi/4 +ASM_SIZE_DIRECTIVE(atan2_tb1) + +atan2_tb2: +ASM_TYPE_DIRECTIVE(atan2_tb2,@object) +data8 0x9F90FB984D8E39D0 , 0x0000BFF3 // P20 +data8 0xCE585A259BD8374C , 0x00003FF0 // P21 +data8 0xBA2E8B9793955C77 , 0x0000BFFB // P4 +data8 
0x88887EBB209E3543 , 0x0000BFFB // P6 +data8 0xD818B4BB43D84BF2 , 0x0000BFF8 // P16 +data8 0xDEC343E068A6D2A8 , 0x0000BFF6 // P18 +data8 0x9297B23CCFFB291F , 0x0000BFFA // P12 +data8 0xD5F4F2182E7A8725 , 0x0000BFF9 // P14 +data8 0xAAAAAAAAAAAAA8A9 , 0x0000BFFD // P0 +data8 0x9249249247E37913 , 0x0000BFFC // P2 +data8 0xC90FDAA22168C235 , 0x00003FFF // pi/2 +data8 0xC90FDAA22168C235 , 0x00004000 // pi +data8 0x96cbe3f9990e91a8 , 0x00004000 // 3pi/4 +ASM_SIZE_DIRECTIVE(atan2_tb2) + + + + +.align 32 +.global atan2# +#ifdef _LIBC +.global __atan2# +.global __ieee754_atan2# +#endif + +//////////////////////////////////////////////////////// + +.section .text +.align 32 + +.proc atan2# +atan2: +#ifdef _LIBC +.proc __atan2# +__atan2: +.proc __ieee754_atan2# +__ieee754_atan2: +#endif +// qnan snan inf norm unorm 0 -+ +// 0 0 1 0 0 0 11 + + +// Y NAN? p10 p11 +// p10 ==> quiet Y and return +// p11 X NAN? p12, p13 +// p12 ==> quiet X and return + +{ .mfi + alloc r32 = ar.pfs,1,5,4,0 + frcpa.s1 atan2_u1_X,p6 = f1,atan2_X + addl EXP_AD_P2 = @ltoff(atan2_tb2), gp +} +{ .mfi + addl EXP_AD_P1 = @ltoff(atan2_tb1), gp + fclass.m.unc p10,p11 = f8, 0xc3 + nop.i 999 +;; +} + +{ .mfi + ld8 EXP_AD_P1 = [EXP_AD_P1] + frcpa.s1 atan2_u1_Y,p7 = f1,atan2_Y + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_two = f1,f1,f1 + nop.i 999 +;; +} + + +{ .mfi + ld8 EXP_AD_P2 = [ EXP_AD_P2] + famax.s1 atan2_Umax = f8,f9 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fmerge.s atan2_absX = f0,atan2_X + nop.i 999 +} +;; + +// p10 Y NAN, quiet and return +{ .mfi + ldfe atan2_P10 = [EXP_AD_P1],16 + fmerge.s atan2_sgnY = atan2_Y,f1 + nop.i 999 +} +{ .mfb + nop.m 999 +(p10) fma.d f8 = f8,f9,f0 +(p10) br.ret.spnt b0 +;; +} + + +{ .mmf + ldfe atan2_P11 = [EXP_AD_P1],16 + ldfe atan2_P20 = [EXP_AD_P2],16 + fmerge.s atan2_sgnX = atan2_X,f1 +;; +} + + +{ .mfi + ldfe atan2_P8 = [EXP_AD_P1],16 + fma.s1 atan2_z1_X = atan2_u1_X, atan2_Y, f0 + nop.i 999 +} +{ .mfi + + ldfe atan2_P21 = [EXP_AD_P2],16 + fma.s1 atan2_z1_Y = 
atan2_u1_Y, atan2_X, f0 + nop.i 999 +;; +} + +{ .mfi + ldfe atan2_P9 = [EXP_AD_P1],16 + fnma.s1 atan2_B1X = atan2_u1_X, atan2_X, atan2_two + nop.i 999 +} +{ .mfi + + ldfe atan2_P4 = [EXP_AD_P2],16 + fnma.s1 atan2_B1Y = atan2_u1_Y, atan2_Y, atan2_two + nop.i 999 +;; +} + +// p6 (atan2_sT) true if swap +// p7 (atan2_sF) true if no swap +// p11 ==> Y !NAN; X NAN? + +{ .mfi + ldfe atan2_P5 = [EXP_AD_P1],16 +// fcmp.eq.unc.s1 atan2_sF,atan2_sT = atan2_Umax, atan2_X + fcmp.eq.unc.s1 p7,p6 = atan2_Umax, atan2_X + nop.i 999 +} +{ .mfi + ldfe atan2_P6 = [EXP_AD_P2],16 +(p11) fclass.m.unc p12,p13 = f9, 0xc3 + nop.i 999 +;; +} + +{ .mmf + ldfe atan2_P7 = [EXP_AD_P1],16 + ldfe atan2_P16 = [EXP_AD_P2],16 + famin.s1 atan2_Vmin = f8,f9 +;; +} + +// p8 true if X positive +// p9 true if X negative +// both are false is swap is true +{ .mfi + ldfe atan2_P17 = [EXP_AD_P1],16 +//(atan2_sF) fcmp.eq.unc.s1 p8,p9 = atan2_sgnX,f1 +(p7) fcmp.eq.unc.s1 p8,p9 = atan2_sgnX,f1 + nop.i 999 +} +{ .mfi + ldfe atan2_P18 = [EXP_AD_P2],16 + fma.s1 atan2_sgnXY = atan2_sgnX, atan2_sgnY, f0 + nop.i 999 +;; +} + + +{ .mfi + ldfe atan2_P19 = [EXP_AD_P1],16 +//(atan2_sF) fma.s1 atan2_wp = atan2_z1_X, atan2_z1_X, f0 +(p7) fma.s1 atan2_wp = atan2_z1_X, atan2_z1_X, f0 + nop.i 999 +} +{ .mfi + ldfe atan2_P12 = [EXP_AD_P2],16 +//(atan2_sT) fma.s1 atan2_wp = atan2_z1_Y, atan2_z1_Y, f0 +(p6) fma.s1 atan2_wp = atan2_z1_Y, atan2_z1_Y, f0 + nop.i 999 +;; +} + + +{ .mfi + ldfe atan2_P13 = [EXP_AD_P1],16 +//(atan2_sF) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0 +(p7) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0 + nop.i 999 +} +{ .mfi + ldfe atan2_P14 = [EXP_AD_P2],16 +//(atan2_sT) fma.s1 atan2_z = atan2_z1_Y, atan2_B1Y, f0 +(p6) fma.s1 atan2_z = atan2_z1_Y, atan2_B1Y, f0 + nop.i 999 +;; +} + + +{ .mfi + ldfe atan2_P15 = [EXP_AD_P1],16 +//(atan2_sF) fma.s1 atan2_B1sq = atan2_B1X, atan2_B1X, f0 +(p7) fma.s1 atan2_B1sq = atan2_B1X, atan2_B1X, f0 + nop.i 999 +} +{ .mfi + ldfe atan2_P0 = [EXP_AD_P2],16 +//(atan2_sT) fma.s1 
atan2_B1sq = atan2_B1Y, atan2_B1Y, f0 +(p6) fma.s1 atan2_B1sq = atan2_B1Y, atan2_B1Y, f0 + nop.i 999 +;; +} + + +// p12 ==> X NAN, quiet and return +{ .mfi + ldfe atan2_P1 = [EXP_AD_P1],16 + fmerge.s atan2_Umax = f0,atan2_Umax + nop.i 999 +} +{ .mfb + ldfe atan2_P2 = [EXP_AD_P2],16 +(p12) fma.d f8 = f9,f8,f0 +(p12) br.ret.spnt b0 +;; +} + + +// p10 ==> x inf y ? +// p11 ==> x !inf y ? +{ .mfi + ldfe atan2_P3 = [EXP_AD_P1],16 + fmerge.s atan2_Vmin = f0,atan2_Vmin + nop.i 999 +} +{ .mfi + ldfe atan2_Pi_by_2 = [EXP_AD_P2],16 + fclass.m.unc p10,p11 = f9, 0x23 + nop.i 999 +;; +} + + +{ .mmf + ldfe atan2_P22 = [EXP_AD_P1],16 + ldfe atan2_pi = [EXP_AD_P2],16 + nop.f 999 +;; +} + +{ .mfi + nop.m 999 + fcmp.eq.s0 p12,p13=f9,f8 // Dummy to catch denormal and invalid + nop.i 999 +;; +} + + +{ .mfi + ldfe atan2_pi_by_4 = [EXP_AD_P1],16 +//(atan2_sT) fmerge.ns atan2_sgnXY = atan2_sgnXY, atan2_sgnXY +(p6) fmerge.ns atan2_sgnXY = atan2_sgnXY, atan2_sgnXY + nop.i 999 +} +{ .mfi + ldfe atan2_3pi_by_4 = [EXP_AD_P2],16 + fma.s1 atan2_w = atan2_wp, atan2_B1sq,f0 + nop.i 999 +;; +} + +// p12 ==> x inf y inf +// p13 ==> x inf y !inf +{ .mfi + nop.m 999 + fmerge.s atan2_z = f0, atan2_z + nop.i 999 +;; +} + +{ .mfi + nop.m 99 +(p10) fclass.m.unc p12,p13 = f8, 0x23 + nop.i 999 +} +{ .mfi + nop.m 99 +(p11) fclass.m.unc p14,p15 = f8, 0x23 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 +(p12) fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1 + nop.i 99 +;; +} + + +{ .mfb + mov atan2_GR_sml_exp = 0x1 // Small exponent for making small norm +(p14) fma.d f8 = atan2_sgnY, atan2_Pi_by_2, f0 +(p14) br.ret.spnt b0 +;; +} + +// Make a very small normal in case need to force inexact and underflow +{ .mfi + setf.exp atan2_sml_norm = atan2_GR_sml_exp + fma.s1 atan2_V13 = atan2_w, atan2_P11, atan2_P10 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_W11 = atan2_w, atan2_P21, atan2_P20 + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 atan2_E = atan2_Vmin, atan2_z, atan2_Umax + nop.i 999 +} +{ .mfi + nop.m 999 + 
fnma.s1 atan2_gamma = atan2_Umax, atan2_z, f1 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_V11 = atan2_w, atan2_P9, atan2_P8 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_V12 = atan2_w, atan2_w, f0 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_V7 = atan2_w, atan2_P5 , atan2_P4 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_V8 = atan2_w, atan2_P7 , atan2_P6 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_W7 = atan2_w, atan2_P17, atan2_P16 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_W8 = atan2_w, atan2_P19, atan2_P18 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_W3 = atan2_w, atan2_P13, atan2_P12 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_W4 = atan2_w, atan2_P15, atan2_P14 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_V3 = atan2_w, atan2_P1 , atan2_P0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_V4 = atan2_w, atan2_P3 , atan2_P2 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_zcub = atan2_z, atan2_w, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fnma.s1 atan2_gV = atan2_Umax, atan2_z, atan2_Vmin + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + frcpa.s1 atan2_F,p15 = f1, atan2_E + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_V10 = atan2_V12, atan2_V13, atan2_V11 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_V6 = atan2_V12, atan2_V8 , atan2_V7 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_V9 = atan2_V12, atan2_V12, f0 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_W10 = atan2_V12, atan2_P22 , atan2_W11 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_W6 = atan2_V12, atan2_W8 , atan2_W7 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_W2 = atan2_V12, atan2_W4 , atan2_W3 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_V2 = atan2_V12, atan2_V4 , atan2_V3 + nop.i 999 +;; +} + + +// Both X and Y are INF +// p10 ==> X + +// p11 ==> X - +.pred.rel "mutex",p10,p11 +{ .mfb + nop.m 999 +(p10) fma.d f8 = atan2_sgnY, atan2_pi_by_4, f0 +(p10) br.ret.spnt b0 +} +{ .mfb + nop.m 
999 +(p11) fma.d f8 = atan2_sgnY, atan2_3pi_by_4, f0 +(p11) br.ret.spnt b0 +;; +} + + +.pred.rel "mutex",p8,p9,p6 +{ .mfi + nop.m 999 + fnma.s1 atan2_alpha = atan2_E, atan2_F, f1 + nop.i 999 +} +{ .mfi + nop.m 999 + fnma.s1 atan2_alpha_1 = atan2_E, atan2_F, atan2_two + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 +//(atan2_sT) fmerge.s atan2_P = atan2_Y, atan2_Pi_by_2 +(p6) fmerge.s atan2_P = atan2_Y, atan2_Pi_by_2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_gVF = atan2_gV, atan2_F, f0 + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 atan2_V5 = atan2_V9, atan2_V10, atan2_V6 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_W12 = atan2_V9, atan2_V9, f0 + nop.i 999 +;; +} + + + +{ .mfi + nop.m 999 +(p8) fmerge.s atan2_P = atan2_sgnY, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_W5 = atan2_V9, atan2_W10, atan2_W6 + nop.i 999 +;; +} + + + + +{ .mfi + nop.m 999 +(p9) fmerge.s atan2_P = atan2_sgnY, atan2_pi + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 atan2_alpha_sq = atan2_alpha, atan2_alpha, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_Cp = atan2_alpha, atan2_alpha_1, f1 + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 atan2_V1 = atan2_V9, atan2_V5, atan2_V2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_W12 = atan2_V9, atan2_W12, f0 + nop.i 999 +;; +} + + +// p13 ==> x inf y !inf +{ .mfi + nop.m 999 + fma.s1 atan2_W1 = atan2_V9, atan2_W5, atan2_W2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p13) fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1 + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 atan2_alpha_cub = atan2_alpha, atan2_alpha_sq, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_C = atan2_gVF, atan2_Cp, f0 + nop.i 999 +;; +} + +.pred.rel "mutex",p10,p11 +// x inf y !inf +{ .mfb + nop.m 999 +(p10) fmerge.s f8 = atan2_sgnY, f0 +(p10) br.ret.spnt b0 +} +{ .mfb + nop.m 999 +(p11) fma.d f8 = atan2_sgnY, atan2_pi, f0 +(p11) br.ret.spnt b0 +;; +} + + + +// p10 ==> y 0 x? +// p11 ==> y !0 x? 
+{ .mfi + nop.m 999 + fclass.m.unc p10,p11 = f8, 0x07 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 +(p8) fmerge.s atan2_sml_norm = atan2_sgnY, atan2_sml_norm + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_Pp = atan2_W12, atan2_W1, atan2_V1 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_d = atan2_alpha_cub, atan2_C, atan2_C + nop.i 999 +;; +} + +// p12 ==> y0 x0 +// p13 ==> y0 x!0 +// p14 ==> y!0 x0 +// p15 ==> y!0 x!0 +{ .mfi + nop.m 999 +(p10) fclass.m.unc p12,p13 = f9, 0x07 + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fclass.m.unc p14,p15 = f9, 0x07 + nop.i 999 +;; +} + + + + +{ .mfb + nop.m 999 +(p13) fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1 +(p12) br.spnt ATAN2_ERROR +;; +} + + + +{ .mfi + nop.m 999 + fma.s1 atan2_pd = atan2_P0, atan2_d, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2_dsq = atan2_d, atan2_d, f0 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2_A_hi = atan2_zcub, atan2_Pp, atan2_z + nop.i 999 +} +{ .mfb + nop.m 999 +(p14) fma.d f8 = atan2_sgnY, atan2_Pi_by_2, f0 +(p14) br.ret.spnt b0 +;; +} + + + +{ .mfb + nop.m 999 +(p10) fmerge.s f8 = atan2_sgnY, f0 +(p10) br.ret.spnt b0 +} +{ .mfb + nop.m 999 +(p11) fma.d f8 = atan2_sgnY, atan2_pi, f0 +(p11) br.ret.spnt b0 +;; +} + + + +{ .mfi + nop.m 999 + fma.s1 atan2_A_lo = atan2_pd, atan2_dsq, atan2_d + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 atan2_A = atan2_A_hi, f1, atan2_A_lo + nop.i 999 +;; +} + +// Force inexact and possibly underflow if very small results +{ .mfi + nop.m 999 +(p8) fma.d atan2_FR_tmp = atan2_sgnXY, atan2_A, atan2_sml_norm + nop.i 999 +} +{ .mfb + nop.m 999 + fma.d f8 = atan2_sgnXY, atan2_A, atan2_P + br.ret.sptk b0 +;; +} + +ATAN2_ERROR: + +{ .mfi + nop.m 999 + fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1 + nop.i 999 +} +;; + +{ .mfi + mov atan2_GR_tag = 37 +(p10) fmerge.s f10 = atan2_sgnY, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fma.d f10 = atan2_sgnY, atan2_pi, f0 + nop.i 999 +;; +} +.endp atan2# +ASM_SIZE_DIRECTIVE(atan2#) + + +// Stack operations when 
calling error support. +// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 +// | | | | +// | <-GR_Y Y2->| Y2 ->| <- GR_Y | +// | | | | +// | | <- GR_X X1 ->| | +// | | | | +// sp-64 -> + sp -> + sp -> + + +// save ar.pfs save b0 restore gp +// save gp restore ar.pfs + + +.proc __libm_error_region +__libm_error_region: +.prologue +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 999 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfd [GR_Parameter_Y] = f8,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfd [GR_Parameter_X] = f9 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = f10 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_atan2f.S b/sysdeps/ia64/fpu/e_atan2f.S new file mode 100644 index 0000000000..85d25a78ad --- /dev/null +++ b/sysdeps/ia64/fpu/e_atan2f.S @@ -0,0 +1,907 @@ +.file "atan2f.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. 
+// +// Contributed 6/1/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. + +// History +//============================================================== +// 6/01/00 Initial version +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 8/17/00 Changed predicate register macro-usage to direct predicate +// names due to an assembler bug. +// 1/05/01 Fixed flag settings for denormal input. +// 1/19/01 Added documentation +// 1/30/01 Improved speed + +// Description +//========================================= +// The atan2 function computes the principal value of the arc tangent of y/x using +// the signs of both arguments to determine the quadrant of the return value. +// A domain error may occur if both arguments are zero. 
+ +// The atan2 function returns the arc tangent of y/x in the range [-pi,+pi] radians. + +//.. +//..Let (v,u) = (y,x) if |y| <= |x|, and (v,u) = (x,y) otherwise. Note that +//..v and u can be negative. We state the relationship between atan2(y,x) and +//..atan(v/u). +//.. +//..Let swap = false if v = y, and swap = true if v = x. +//..Define C according to the matrix +//.. +//.. TABLE FOR C +//.. x +ve x -ve +//.. no swap (swap = false) sgn(y)*0 sgn(y)*pi +//.. swap (swap = true ) sgn(y)*pi/2 sgn(y)*pi/2 +//.. +//.. atan2(y,x) = C + atan(v/u) if no swap +//.. atan2(y,x) = C - atan(v/u) if swap +//.. +//..This relationship is more efficient to compute as we accommodate signs in v and u +//..saving the need to obtain the absolute value before computation can proceed. +//.. +//..Suppose (v,u) = (y,x), we calculate atan(v/u) as follows: +//..A = y * frcpa(x) (so A = (y/x)(1 - beta)) +//..atan(y/x) = atan(A) + atan( ((y/x)-A)/(1 + (y/x)A) ), the second term is +//..a correction. +//..atan(A) is approximated by a polynomial +//..A + p1 A^3 + p2 A^5 + ... + p10 A^21, +//..atan(G) is approximated as follows: +//..Let G = (y - Ax)/(x + Ay), atan(G) can be approximated by G + p1 * g^3 +//..where g is a limited precision approximation to G via g = (y - Ax)*frcpa(x + Ay). +//.. +//..Suppose (v,u) = (x,y), we calculate atan(v/u) as follows: +//..Z = x * frcpa(y) (so Z = (x/y)(1 - beta)) +//..atan(x/y) = atan(Z) + atan( ((x/y)-Z)/(1 + (x/y)Z) ), the second term is +//..a correction. +//..atan(Z) is approximated by a polynomial +//..Z + p1 Z^3 + p2 Z^5 + ... + p10 Z^21, +//..atan(T) is approximated as follows: +//..Let T = (x - Zy)/(y + Zx), atan(T) can be approximated by T + p1 * t^3 +//..where t is a limited precision approximation to T via t = (x - Zy)*frcpa(y + Zx). +//.. +//.. +//..A = y * frcpa(x) +//..atan(A) ~=~ A + p1 A^3 + ... + P10 A^21 +//.. 
+//..This polynomial is computed as follows: +//..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq +//..A5 = Asq*Acub, A6 = Asq*A4; A11 = A5 * A6 +//.. +//..poly_A1 = p9 + Asq*p10, poly_A2 = p7 + Asq*p8, poly_A3 = p5 + Asq*p6 +//..poly_A1 = poly_A2 + A4 * poly_A1 +//..poly_A1 = poly_A3 + A4 * poly_A1 +//.. +//..poly_A4 = p1 * A +//..poly_A5 = p3 + Asq * p4, poly_A4 = A + Asq*poly_A4 +//..poly_A5 = p2 + Asq * poly_A5 +//..poly_A4 = poly_A4 + A5 * poly_A5 +//.. +//..atan_A = poly_A4 + A11 * poly_A1 +//.. +//..atan(G) is approximated as follows: +//..G_numer = y - A*x, G_denom = x + A*y +//..H1 = frcpa(G_denom) +//..H_beta = 1 - H1 * G_denom +//..H2 = H1 + H1 * H_beta +//..H_beta2 = H_beta*H_beta +//..H3 = H2 + H2*H_beta2 +//..g = H1 * G_numer; gsq = g*g; atan_G = g*p1, atan_G = atan_G*gsq +//..atan_G = G_numer*H3 + atan_G +//.. +//.. +//..A = y * frcpa(x) +//..atan(A) ~=~ A + p1 A^3 + ... + P10 A^21 +//.. +//..This polynomial is computed as follows: +//..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq +//..A5 = Asq*Acub, A6 = Asq*A4; A11 = A5 * A6 +//.. +//..poly_A1 = p9 + Asq*p10, poly_A2 = p7 + Asq*p8, poly_A3 = p5 + Asq*p6 +//..poly_A1 = poly_A2 + A4 * poly_A1 +//..poly_A1 = poly_A3 + A4 * poly_A1 +//.. +//..poly_A4 = p1 * A +//..poly_A5 = p3 + Asq * p4, poly_A4 = A + Asq*poly_A4 +//..poly_A5 = p2 + Asq * poly_A5 +//..poly_A4 = poly_A4 + A5 * poly_A5 +//.. +//..atan_A = poly_A4 + A11 * poly_A1 +//.. +//.. +//..==================================================================== +//.. COEFFICIENTS USED IN THE COMPUTATION +//..==================================================================== + +//coef_pj, j = 1,2,...,10; atan(A) ~=~ A + p1 A^3 + p2 A^5 + ... 
+ p10 A^21 +// +// coef_p1 = -.3333332707155439167401311806315789E+00 +// coef_p1 in dbl = BFD5 5555 1219 1621 +// +// coef_p2 = .1999967670926658391827857030875748E+00 +// coef_p2 in dbl = 3FC9 997E 7AFB FF4E +// +// coef_p3 = -.1427989384500152360161563301087296E+00 +// coef_p3 in dbl = BFC2 473C 5145 EE38 +// +// coef_p4 = .1105852823460720770079031213661163E+00 +// coef_p4 in dbl = 3FBC 4F51 2B18 65F5 +// +// coef_p5 = -.8811839915595312348625710228448363E-01 +// coef_p5 in dbl = BFB6 8EED 6A8C FA32 +// +// coef_p6 = .6742329836955067042153645159059714E-01 +// coef_p6 in dbl = 3FB1 42A7 3D7C 54E3 +// +// coef_p7 = -.4468571068774672908561591262231909E-01 +// coef_p7 in dbl = BFA6 E10B A401 393F +// +// coef_p8 = .2252333246746511135532726960586493E-01 +// coef_p8 in dbl = 3F97 105B 4160 F86B +// +// coef_p9 = -.7303884867007574742501716845542314E-02 +// coef_p9 in dbl = BF7D EAAD AA33 6451 +// +// coef_p10 = .1109686868355312093949039454619058E-02 +// coef_p10 in dbl = 3F52 2E5D 33BC 9BAA +// + +// Special values +//============================================================== +// Y x Result +// +number +inf +0 +// -number +inf -0 +// +number -inf +pi +// -number -inf -pi +// +// +inf +number +pi/2 +// -inf +number -pi/2 +// +inf -number +pi/2 +// -inf -number -pi/2 +// +// +inf +inf +pi/4 +// -inf +inf -pi/4 +// +inf -inf +3pi/4 +// -inf -inf -3pi/4 +// +// +1 +1 +pi/4 +// -1 +1 -pi/4 +// +1 -1 +3pi/4 +// -1 -1 -3pi/4 +// +// +number +0 +pi/2 // does not raise DBZ +// -number +0 -pi/2 // does not raise DBZ +// +number -0 +pi/2 // does not raise DBZ +// -number -0 -pi/2 // does not raise DBZ +// +// +0 +number +0 +// -0 +number -0 +// +0 -number +pi +// -0 -number -pi +// +// +0 +0 +0 // does not raise invalid +// -0 +0 -0 // does not raise invalid +// +0 -0 +pi // does not raise invalid +// -0 -0 -pi // does not raise invalid +// +// Nan anything quiet Y +// anything NaN quiet X + +// atan2(+-0/+-0) sets double error tag to 37 +// atan2f(+-0/+-0) sets single 
error tag to 38 +// These are domain errors. + +#include "libm_support.h" + +// +// Assembly macros +//========================================= + + +// integer registers +atan2f_GR_Addr_1 = r33 +atan2f_GR_Addr_2 = r34 +GR_SAVE_B0 = r35 + +GR_SAVE_PFS = r36 +GR_SAVE_GP = r37 + +GR_Parameter_X = r38 +GR_Parameter_Y = r39 +GR_Parameter_RESULT = r40 +GR_Parameter_TAG = r41 + +// floating point registers +atan2f_coef_p1 = f32 +atan2f_coef_p10 = f33 +atan2f_coef_p7 = f34 +atan2f_coef_p6 = f35 + +atan2f_coef_p3 = f36 +atan2f_coef_p2 = f37 +atan2f_coef_p9 = f38 +atan2f_coef_p8 = f39 +atan2f_coef_p5 = f40 + +atan2f_coef_p4 = f41 +atan2f_const_piby2 = f42 +atan2f_const_pi = f43 +atan2f_const_piby4 = f44 +atan2f_const_3piby4 = f45 + +atan2f_xsq = f46 +atan2f_ysq = f47 +atan2f_xy = f48 +atan2f_const_1 = f49 +atan2f_sgn_Y = f50 + +atan2f_Z0 = f51 +atan2f_A0 = f52 +atan2f_Z = f53 +atan2f_A = f54 +atan2f_C = f55 + +atan2f_U = f56 +atan2f_Usq = f57 +atan2f_U4 = f58 +atan2f_U6 = f59 +atan2f_U8 = f60 + +atan2f_poly_u109 = f61 +atan2f_poly_u87 = f62 +atan2f_poly_u65 = f63 +atan2f_poly_u43 = f64 +atan2f_poly_u21 = f65 + +atan2f_poly_u10to7 = f66 +atan2f_poly_u6to3 = f67 +atan2f_poly_u10to3 = f68 +atan2f_poly_u10to0 = f69 +atan2f_poly_u210 = f70 + +atan2f_T_numer = f71 +atan2f_T_denom = f72 +atan2f_G_numer = f73 +atan2f_G_denom = f74 +atan2f_p1rnum = f75 + +atan2f_R_denom = f76 +atan2f_R_numer = f77 +atan2f_pR = f78 +atan2f_pRC = f79 +atan2f_pQRC = f80 + +atan2f_Q1 = f81 +atan2f_Q_beta = f82 +atan2f_Q2 = f83 +atan2f_Q_beta2 = f84 +atan2f_Q3 = f85 + +atan2f_r = f86 +atan2f_rsq = f87 +atan2f_poly_atan_U = f88 + + +// predicate registers +//atan2f_Pred_Swap = p6 // |y| > |x| +//atan2f_Pred_noSwap = p7 // |y| <= |x| +//atan2f_Pred_Xpos = p8 // x >= 0 +//atan2f_Pred_Xneg = p9 // x < 0 + + +.data + +.align 16 + +atan2f_coef_table1: +ASM_TYPE_DIRECTIVE(atan2f_coef_table1,@object) +data8 0xBFD5555512191621 // p1 +data8 0x3F522E5D33BC9BAA // p10 +data8 0xBFA6E10BA401393F // p7 +data8 
0x3FB142A73D7C54E3 // p6 +data8 0xBFC2473C5145EE38 // p3 +data8 0x3FC9997E7AFBFF4E // p2 +ASM_SIZE_DIRECTIVE(atan2f_coef_table1) + +atan2f_coef_table2: +ASM_TYPE_DIRECTIVE(atan2f_coef_table2,@object) +data8 0xBF7DEAADAA336451 // p9 +data8 0x3F97105B4160F86B // p8 +data8 0xBFB68EED6A8CFA32 // p5 +data8 0x3FBC4F512B1865F5 // p4 +data8 0x3ff921fb54442d18 // pi/2 +data8 0x400921fb54442d18 // pi +data8 0x3fe921fb54442d18 // pi/4 +data8 0x4002d97c7f3321d2 // 3pi/4 +ASM_SIZE_DIRECTIVE(atan2f_coef_table2) + + + +.global atan2f +#ifdef _LIBC +.global __atan2f +.global __ieee754_atan2f +#endif + +.text +.align 32 + +atan2f: +.proc atan2f +#ifdef _LIBC +.proc __atan2f +__atan2f: +.proc __ieee754_atan2f +__ieee754_atan2f: +#endif + + + +{ .mfi + alloc r32 = ar.pfs,1,5,4,0 + frcpa.s1 atan2f_Z0,p0 = f1,f8 // Approx to 1/y + nop.i 999 +} +{ .mfi + addl atan2f_GR_Addr_1 = @ltoff(atan2f_coef_table1),gp + fma.s1 atan2f_xsq = f9,f9,f0 + nop.i 999 ;; +} + + +{ .mfi + ld8 atan2f_GR_Addr_1 = [atan2f_GR_Addr_1] + frcpa.s1 atan2f_A0,p0 = f1,f9 // Approx to 1/x + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_ysq = f8,f8,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fcmp.ge.s1 p8,p9 = f9,f0 // Set p8 if x>=0, p9 if x<0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_xy = f9,f8,f0 + nop.i 999 ;; +} + + +{ .mfi + add atan2f_GR_Addr_2 = 0x30, atan2f_GR_Addr_1 + fmerge.s atan2f_sgn_Y = f8,f1 + nop.i 999 ;; +} + +{ .mmf + ldfpd atan2f_coef_p1,atan2f_coef_p10 = [atan2f_GR_Addr_1],16 + ldfpd atan2f_coef_p9,atan2f_coef_p8 = [atan2f_GR_Addr_2],16 + fclass.m p10,p0 = f9,0xe7 // Test x @inf|@snan|@qnan|@zero +} +;; + +{ .mfi + ldfpd atan2f_coef_p7,atan2f_coef_p6 = [atan2f_GR_Addr_1],16 + fma.s1 atan2f_T_denom = atan2f_Z0,atan2f_xsq,f8 + nop.i 999 +} +{ .mfi + ldfpd atan2f_coef_p5,atan2f_coef_p4 = [atan2f_GR_Addr_2],16 + fma.s1 atan2f_Z = atan2f_Z0,f9,f0 + nop.i 999 ;; +} + + +{ .mfi + ldfpd atan2f_coef_p3,atan2f_coef_p2 = [atan2f_GR_Addr_1],16 + fma.s1 atan2f_G_denom = atan2f_A0,atan2f_ysq,f9 + 
nop.i 999 +} +{ .mfi + ldfpd atan2f_const_piby2,atan2f_const_pi = [atan2f_GR_Addr_2],16 + fma.s1 atan2f_A = atan2f_A0,f8,f0 + nop.i 999 ;; +} + +{ .mfi + ldfpd atan2f_const_piby4,atan2f_const_3piby4 = [atan2f_GR_Addr_2] + fclass.m p11,p0 = f8,0xe7 // Test y @inf|@snan|@qnan|@zero + nop.i 999 +} +{ .mfb + nop.m 999 + fnma.s1 atan2f_T_numer = atan2f_Z0,atan2f_xy,f9 +(p10) br.cond.spnt ATAN2F_XY_INF_NAN_ZERO ;; // Branch on x nan,inf,zero +} + + +// p6 if |y|>|x|, p7 if |x|>=|y| , use xsq and ysq for test +{ .mfi + nop.m 999 + fcmp.gt.s1 p6,p7 = atan2f_ysq,atan2f_xsq + nop.i 999 +} +{ .mfb + nop.m 999 + fnma.s1 atan2f_G_numer = atan2f_A0,atan2f_xy,f8 +(p11) br.cond.spnt ATAN2F_XY_INF_NAN_ZERO ;; // Branch on y nan,inf,zero +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 atan2f_const_1 = atan2f_sgn_Y,f0,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 atan2f_const_1 = atan2f_sgn_Y,f1,f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fnma.s1 atan2f_U = atan2f_Z,f1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p6) fma.s1 atan2f_Usq = atan2f_Z,atan2f_Z,f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p7) fma.s1 atan2f_U = atan2f_A,f1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan2f_Usq = atan2f_A,atan2f_A,f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) frcpa.s1 atan2f_Q1,p0 = f1,atan2f_T_denom + nop.i 999 +} +{ .mfi + nop.m 999 +(p6) fma.s1 atan2f_R_denom = atan2f_T_denom,f1,f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p7) frcpa.s1 atan2f_Q1,p0 = f1,atan2f_G_denom + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan2f_R_denom = atan2f_G_denom,f1,f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fnma.s1 atan2f_R_numer = atan2f_T_numer,f1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan2f_R_numer = atan2f_G_numer,f1,f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fnma.s1 atan2f_p1rnum = atan2f_T_numer,atan2f_coef_p1,f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan2f_p1rnum = atan2f_G_numer,atan2f_coef_p1,f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 + 
fma.s1 atan2f_U4 = atan2f_Usq,atan2f_Usq,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_poly_u109 = atan2f_Usq,atan2f_coef_p10,atan2f_coef_p9 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2f_poly_u87 = atan2f_Usq,atan2f_coef_p8,atan2f_coef_p7 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_poly_u65 = atan2f_Usq,atan2f_coef_p6,atan2f_coef_p5 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 + fma.s1 atan2f_poly_u43 = atan2f_Usq,atan2f_coef_p4,atan2f_coef_p3 + nop.i 999 +} +{ .mfi + nop.m 999 + fnma.s1 atan2f_Q_beta = atan2f_Q1,atan2f_R_denom,f1 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 + fma.s1 atan2f_poly_u21 = atan2f_Usq,atan2f_coef_p2,atan2f_coef_p1 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_r = atan2f_Q1,atan2f_R_numer,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p6) fma.s1 atan2f_C = atan2f_sgn_Y,atan2f_const_piby2,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan2f_C = atan2f_const_1,atan2f_const_pi,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2f_U6 = atan2f_U4,atan2f_Usq,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_U8 = atan2f_U4,atan2f_U4,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2f_poly_u10to7 = atan2f_U4,atan2f_poly_u109,atan2f_poly_u87 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_pR = atan2f_p1rnum,atan2f_Q1,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2f_poly_u6to3 = atan2f_U4,atan2f_poly_u65,atan2f_poly_u43 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_Q2 = atan2f_Q1,atan2f_Q_beta,atan2f_Q1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2f_Q_beta2 = atan2f_Q_beta,atan2f_Q_beta,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_rsq = atan2f_r,atan2f_r,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2f_poly_u210 = atan2f_Usq,atan2f_poly_u21,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fcmp.eq.s0 p8,p0 = f8,f9 // Dummy op to set flag on denormal inputs + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_poly_u10to3 = atan2f_U8,atan2f_poly_u10to7,atan2f_poly_u6to3 
+ nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2f_Q3 = atan2f_Q2,atan2f_Q_beta2,atan2f_Q2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atan2f_pRC = atan2f_rsq,atan2f_pR,atan2f_C + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2f_poly_u10to0 = atan2f_U6,atan2f_poly_u10to3,atan2f_poly_u210 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 atan2f_pQRC = atan2f_R_numer,atan2f_Q3,atan2f_pRC + nop.i 999 ;; +} + +{ .mfb + nop.m 999 + fma.s.s0 f8 = atan2f_U,atan2f_poly_u10to0,atan2f_pQRC + br.ret.sptk b0 ;; +} + + + +ATAN2F_XY_INF_NAN_ZERO: + +{ .mfi + nop.m 999 + fclass.m p10,p0 = f8,0xc3 // Is y nan + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fclass.m p12,p0 = f9,0xc3 // Is x nan + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fclass.m p6,p0 = f9,0x21 // Is x +inf + nop.i 999 +} +{ .mfb + nop.m 999 +(p10) fma.s f8 = f9,f8,f0 // Result quietized y if y is nan +(p10) br.ret.spnt b0 // Exit if y is nan +} +;; + + +{ .mfi + nop.m 999 +(p6) fclass.m.unc p7,p8 = f8,0x23 // x +inf, is y inf + nop.i 999 +} +{ .mfb + nop.m 999 +(p12) fnorm.s f8 = f9 // Result quietized x if x is nan, y not nan +(p12) br.ret.spnt b0 // Exit if x is nan, y not nan +} +;; + +// Here if x or y inf, or x or y zero +{ .mfi + nop.m 999 + fcmp.eq.s0 p15,p0 = f8,f9 // Dummy op to set flag on denormal inputs + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fclass.m p11,p12 = f9,0x22 // Is x -inf + nop.i 999 +} +{ .mfb + nop.m 999 +(p7) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby4,f0 // Result +-pi/4 +(p7) br.ret.spnt b0 // Exit if x +inf and y inf +} +;; + +{ .mfb + nop.m 999 +(p8) fmerge.s f8 = f8,f0 // If x +inf and y not inf, result +-0 +(p8) br.ret.spnt b0 // Exit if x +inf and y not inf +} +;; + +{ .mfi + nop.m 999 +(p12) fclass.m.unc p13,p0 = f8,0x23 // x not -inf, is y inf + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p11) fclass.m.unc p14,p15 = f8,0x23 // x -inf, is y inf + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fclass.m p6,p7 = f9,0x7 // Is x zero + nop.i 999 +} +{ .mfb + nop.m 999 +(p13) fma.s f8 = 
atan2f_sgn_Y, atan2f_const_piby2,f0 // Result +-pi/2 +(p13) br.ret.spnt b0 // Exit if x not -inf and y inf +} +;; + +{ .mfi + nop.m 999 +(p14) fma.s f8 = atan2f_sgn_Y, atan2f_const_3piby4,f0 // Result +-3pi/4 + nop.i 999 +} +{ .mfb + nop.m 999 +(p15) fma.s f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // Result +-pi +(p11) br.ret.spnt b0 // Exit if x -inf +} +;; + +// Here if x or y zero +{ .mfi + nop.m 999 +(p7) fclass.m.unc p8,p9 = f9,0x19 // x not zero, y zero, is x > zero + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p6) fclass.m.unc p10,p11 = f8,0x7 // x zero, is y zero + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p8) fmerge.s f8 = f8, f0 // x > zero and y zero, result is +-zero + nop.i 999 +} +{ .mfb + nop.m 999 +(p9) fma.s f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // x < 0, y 0, result +-pi +(p10) br.cond.spnt __libm_error_region // Branch if x zero and y zero +} +;; + +{ .mfb + nop.m 999 +(p11) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // x zero, y not zero + br.ret.sptk b0 // Final special case exit +} +;; + + +.endp atan2f +ASM_SIZE_DIRECTIVE(atan2f) + + +.proc __libm_error_region +__libm_error_region: +.prologue + mov GR_Parameter_TAG = 38 + fclass.m p10,p11 = f9,0x5 // @zero | @pos +;; +(p10) fmerge.s f10 = f8, f0 +(p11) fma.s f10 = atan2f_sgn_Y, atan2f_const_pi,f0 +;; + +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 999 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} + +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +} +;; + +{ .mmi + stfs [GR_Parameter_Y] = f9,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +} +;; + + +.body +{ .mib + stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfs [GR_Parameter_Y] = f10 // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + 
br.call.sptk b0=__libm_error_support# // Call error handling function +} +;; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +} +;; + +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +} +;; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_atan2l.c b/sysdeps/ia64/fpu/e_atan2l.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/e_atan2l.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/e_cosh.S b/sysdeps/ia64/fpu/e_cosh.S new file mode 100644 index 0000000000..1ac0e1c29d --- /dev/null +++ b/sysdeps/ia64/fpu/e_cosh.S @@ -0,0 +1,1142 @@ +.file "cosh.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// +// API +//============================================================== +// double = cosh(double) +// input floating point f8 +// output floating point f8 + + +// Overview of operation +//============================================================== +// There are four paths + +// 1. |x| < 0.25 COSH_BY_POLY +// 2. |x| < 32 COSH_BY_TBL +// 3. |x| < 2^14 COSH_BY_EXP +// 4. |x| >= 2^14 COSH_HUGE + +// For paths 1 and 2, SAFE is always 1. +// For path 4, SAFE is always 0. +// SAFE = 1 means we cannot overflow.
+ +#include "libm_support.h" + +// Assembly macros +//============================================================== +cosh_FR_X = f44 +cosh_FR_SGNX = f40 + +cosh_FR_Inv_log2by64 = f9 +cosh_FR_log2by64_lo = f11 +cosh_FR_log2by64_hi = f10 + +cosh_FR_A1 = f9 +cosh_FR_A2 = f10 +cosh_FR_A3 = f11 + +cosh_FR_Rcub = f12 +cosh_FR_M_temp = f13 +cosh_FR_R_temp = f13 +cosh_FR_Rsq = f13 +cosh_FR_R = f14 + +cosh_FR_M = f38 + +cosh_FR_B1 = f15 +cosh_FR_B2 = f32 +cosh_FR_B3 = f33 + +cosh_FR_peven_temp1 = f34 +cosh_FR_peven_temp2 = f35 +cosh_FR_peven = f36 + +cosh_FR_podd_temp1 = f34 +cosh_FR_podd_temp2 = f35 +cosh_FR_podd = f37 + +cosh_FR_J_temp = f9 +cosh_FR_J = f10 + +cosh_FR_Mmj = f39 + +cosh_FR_N_temp1 = f11 +cosh_FR_N_temp2 = f12 +cosh_FR_N = f13 + +cosh_FR_spos = f14 +cosh_FR_sneg = f15 + +cosh_FR_Tjhi = f32 +cosh_FR_Tjlo = f33 +cosh_FR_Tmjhi = f34 +cosh_FR_Tmjlo = f35 + +GR_mJ = r35 +GR_J = r36 + +AD_mJ = r38 +AD_J = r39 + +cosh_FR_C_hi = f9 +cosh_FR_C_hi_temp = f10 +cosh_FR_C_lo_temp1 = f11 +cosh_FR_C_lo_temp2 = f12 +cosh_FR_C_lo_temp3 = f13 + +cosh_FR_C_lo = f38 +cosh_FR_S_hi = f39 + +cosh_FR_S_hi_temp1 = f10 +cosh_FR_Y_hi = f11 +cosh_FR_Y_lo_temp = f12 +cosh_FR_Y_lo = f13 +cosh_FR_COSH = f9 + +cosh_FR_X2 = f9 +cosh_FR_X4 = f10 + +cosh_FR_P1 = f14 +cosh_FR_P2 = f15 +cosh_FR_P3 = f32 +cosh_FR_P4 = f33 +cosh_FR_P5 = f34 +cosh_FR_P6 = f35 + +cosh_FR_TINY_THRESH = f9 + +cosh_FR_COSH_temp = f10 +cosh_FR_SCALE = f11 + +cosh_FR_hi_lo = f10 + +cosh_FR_poly_podd_temp1 = f11 +cosh_FR_poly_podd_temp2 = f13 +cosh_FR_poly_peven_temp1 = f11 +cosh_FR_poly_peven_temp2 = f13 + +GR_SAVE_PFS = r41 +GR_SAVE_B0 = r42 +GR_SAVE_GP = r43 + +GR_Parameter_X = r44 +GR_Parameter_Y = r45 +GR_Parameter_RESULT = r46 + + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 +double_cosh_arg_reduction: +ASM_TYPE_DIRECTIVE(double_cosh_arg_reduction,@object) + data8 0xB8AA3B295C17F0BC, 0x00004005 + data8 
0xB17217F7D1000000, 0x00003FF8 + data8 0xCF79ABC9E3B39804, 0x00003FD0 +ASM_SIZE_DIRECTIVE(double_cosh_arg_reduction) + +double_cosh_p_table: +ASM_TYPE_DIRECTIVE(double_cosh_p_table,@object) + data8 0x8000000000000000, 0x00003FFE + data8 0xAAAAAAAAAAAAAB80, 0x00003FFA + data8 0xB60B60B60B4FE884, 0x00003FF5 + data8 0xD00D00D1021D7370, 0x00003FEF + data8 0x93F27740C0C2F1CC, 0x00003FE9 + data8 0x8FA02AC65BCBD5BC, 0x00003FE2 +ASM_SIZE_DIRECTIVE(double_cosh_p_table) + +double_cosh_ab_table: +ASM_TYPE_DIRECTIVE(double_cosh_ab_table,@object) + data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC + data8 0x88888888884ECDD5, 0x00003FF8 + data8 0xD00D0C6DCC26A86B, 0x00003FF2 + data8 0x8000000000000002, 0x00003FFE + data8 0xAAAAAAAAAA402C77, 0x00003FFA + data8 0xB60B6CC96BDB144D, 0x00003FF5 +ASM_SIZE_DIRECTIVE(double_cosh_ab_table) + +double_cosh_j_table: +ASM_TYPE_DIRECTIVE(double_cosh_j_table,@object) + data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 + data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 + data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 + data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 + data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 + data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 + data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 + data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 + data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 + data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 + data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 + data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 + data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 + data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 + data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000 + data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 + data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 + data8 0xD99D15C278AFD7B6, 0x00003FFE, 
0x9BBC610A, 0x00000000 + data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 + data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 + data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 + data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 + data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 + data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 + data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 + data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 + data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 + data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 + data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 + data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 + data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 + data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 + data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 + data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 + data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 + data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 + data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 + data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 + data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 + data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 + data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 + data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 + data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 + data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 + data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 + data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 + data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000 + data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 + data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 + data8 
0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 + data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 + data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 + data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 + data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 + data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 + data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 + data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 + data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 + data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 + data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 + data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 + data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 + data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 + data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 + data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 +ASM_SIZE_DIRECTIVE(double_cosh_j_table) + +.align 32 +.global cosh# + +.section .text +.proc cosh# +.align 32 + +cosh: + +#ifdef _LIBC +.global __ieee754_cosh# +.proc __ieee754_cosh# +__ieee754_cosh: +#endif + +// X NAN? 
+ +{ .mfi + alloc r32 = ar.pfs,0,12,4,0 +(p0) fclass.m.unc p6,p7 = f8, 0xc3 //@snan | @qnan + nop.i 999 +} +;; + + +{ .mfb + nop.m 999 +(p6) fma.d.s0 f8 = f8,f1,f8 +(p6) br.ret.spnt b0 ;; +} + + +// X infinity +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6,p0 = f8, 0x23 //@inf + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p6) fmerge.s f8 = f0,f8 +(p6) br.ret.spnt b0 ;; +} + + + +// Put 0.25 in f9; p6 true if x < 0.25 +{ .mlx + nop.m 999 +(p0) movl r32 = 0x000000000000fffd ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s cosh_FR_X = f0,f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.s cosh_FR_SGNX = f8,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc p0,p7 = cosh_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.sptk L(COSH_BY_TBL) +} +;; + + +// COSH_BY_POLY: +// POLY cannot overflow so there is no need to call __libm_error_support +// Get the values of P_x from the table + +{ .mmi + nop.m 999 +(p0) addl r34 = @ltoff(double_cosh_p_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// Calculate cosh_FR_X2 = ax*ax and cosh_FR_X4 = ax*ax*ax*ax +{ .mmf + nop.m 999 +(p0) ldfe cosh_FR_P1 = [r34],16 +(p0) fma.s1 cosh_FR_X2 = cosh_FR_X, cosh_FR_X, f0 ;; +} + +{ .mmi +(p0) ldfe cosh_FR_P2 = [r34],16 ;; +(p0) ldfe cosh_FR_P3 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe cosh_FR_P4 = [r34],16 ;; +(p0) ldfe cosh_FR_P5 = [r34],16 + nop.i 999 ;; +} + +{ .mfi +(p0) ldfe cosh_FR_P6 = [r34],16 +(p0) fma.s1 cosh_FR_X4 = cosh_FR_X2, cosh_FR_X2, f0 + nop.i 999 ;; +} + +// Calculate cosh_FR_podd = x4 *(x4 * P_5 + P_3) + P_1 +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_poly_podd_temp1 = cosh_FR_X4, cosh_FR_P5, cosh_FR_P3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_podd = cosh_FR_X4, cosh_FR_poly_podd_temp1, cosh_FR_P1 + nop.i 999 +} + +// Calculate cosh_FR_peven = p_even = x4 *(x4 * (x4 * P_6 + P_4) + P_2) +{ .mfi + nop.m 999 +(p0) fma.s1 
cosh_FR_poly_peven_temp1 = cosh_FR_X4, cosh_FR_P6, cosh_FR_P4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_poly_peven_temp2 = cosh_FR_X4, cosh_FR_poly_peven_temp1, cosh_FR_P2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_peven = cosh_FR_X4, cosh_FR_poly_peven_temp2, f0 + nop.i 999 ;; +} + +// Y_lo = x2*p_odd + p_even +// Calculate f8 = Y_hi + Y_lo +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_X2, cosh_FR_podd, cosh_FR_peven + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.d.s0 f8 = f1, f1, cosh_FR_Y_lo +(p0) br.ret.sptk b0 ;; +} + + +L(COSH_BY_TBL): + +// Now that we are at TBL; so far all we know is that |x| >= 0.25. +// The first two steps are the same for TBL and EXP, but if we are HUGE +// Double +// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) +// Single +// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) +// we want to leave now. Go to HUGE if |x| >= 2^14 +// 1000d (register-biased) is e = 14 (true) + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000010009 ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc p6,p7 = cosh_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(COSH_HUGE) ;; +} + +// r32 = 1 +// r34 = N-1 +// r35 = N +// r36 = j +// r37 = N+1 + +// TBL can never overflow +// cosh(x) = cosh(B+R) +// = cosh(B) cosh(R) + sinh(B) sinh(R) +// cosh(R) can be approximated by 1 + p_even +// sinh(R) can be approximated by p_odd + +// ****************************************************** +// STEP 1 (TBL and EXP) +// ****************************************************** +// Get the following constants. +// f9 = Inv_log2by64 +// f10 = log2by64_hi +// f11 = log2by64_lo + +{ .mmi +(p0) adds r32 = 0x1,r0 +(p0) addl r34 = @ltoff(double_cosh_arg_reduction), gp + nop.i 999 +} +;; + +// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and +// put them in an exponent. 
+// cosh_FR_spos = 2^(N-1) and cosh_FR_sneg = 2^(-N-1) +// r39 = 0xffff + (N-1) = 0xffff +N -1 +// r40 = 0xffff - (N +1) = 0xffff -N -1 + +{ .mlx + ld8 r34 = [r34] +(p0) movl r38 = 0x000000000000fffe ;; +} + +{ .mmi +(p0) ldfe cosh_FR_Inv_log2by64 = [r34],16 ;; +(p0) ldfe cosh_FR_log2by64_hi = [r34],16 + nop.i 999 ;; +} + +{ .mbb +(p0) ldfe cosh_FR_log2by64_lo = [r34],16 + nop.b 999 + nop.b 999 ;; +} + +// Get the A coefficients +// f9 = A_1 +// f10 = A_2 +// f11 = A_3 + +{ .mmi + nop.m 999 +(p0) addl r34 = @ltoff(double_cosh_ab_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// Calculate M and keep it as integer and floating point. +// M = round-to-integer(x*Inv_log2by64) +// cosh_FR_M = M = truncate(ax/(log2/64)) +// Put the significand of M in r35 +// and the floating point representation of M in cosh_FR_M + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_M = cosh_FR_X, cosh_FR_Inv_log2by64, f0 + nop.i 999 +} + +{ .mfi +(p0) ldfe cosh_FR_A1 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcvt.fx.s1 cosh_FR_M_temp = cosh_FR_M + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnorm.s1 cosh_FR_M = cosh_FR_M_temp + nop.i 999 ;; +} + +{ .mfi +(p0) getf.sig r35 = cosh_FR_M_temp + nop.f 999 + nop.i 999 ;; +} + +// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It +// has a range of -32 thru 31. 
+// r35 = M +// r36 = j +{ .mii + nop.m 999 + nop.i 999 ;; +(p0) and r36 = 0x3f, r35 ;; +} + +// Calculate R +// f13 = f44 - f12*f10 = x - M*log2by64_hi +// f14 = f13 - f8*f11 = R = (x - M*log2by64_hi) - M*log2by64_lo + +{ .mfi + nop.m 999 +(p0) fnma.s1 cosh_FR_R_temp = cosh_FR_M, cosh_FR_log2by64_hi, cosh_FR_X + nop.i 999 +} + +{ .mfi +(p0) ldfe cosh_FR_A2 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 cosh_FR_R = cosh_FR_M, cosh_FR_log2by64_lo, cosh_FR_R_temp + nop.i 999 +} + +// Get the B coefficients +// f15 = B_1 +// f32 = B_2 +// f33 = B_3 + +{ .mmi +(p0) ldfe cosh_FR_A3 = [r34],16 ;; +(p0) ldfe cosh_FR_B1 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe cosh_FR_B2 = [r34],16 ;; +(p0) ldfe cosh_FR_B3 = [r34],16 + nop.i 999 ;; +} + +{ .mii + nop.m 999 +(p0) shl r34 = r36, 0x2 ;; +(p0) sxt1 r37 = r34 ;; +} + +// ****************************************************** +// STEP 2 (TBL and EXP) +// ****************************************************** +// Calculate Rsquared and Rcubed in preparation for p_even and p_odd +// f12 = R*R*R +// f13 = R*R +// f14 = R <== from above + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Rsq = cosh_FR_R, cosh_FR_R, f0 +(p0) shr r36 = r37, 0x2 ;; +} + +// r34 = M-j = r35 - r36 +// r35 = N = (M-j)/64 + +{ .mii +(p0) sub r34 = r35, r36 + nop.i 999 ;; +(p0) shr r35 = r34, 0x6 ;; +} + +{ .mii +(p0) sub r40 = r38, r35 +(p0) adds r37 = 0x1, r35 +(p0) add r39 = r38, r35 ;; +} + +// Get the address of the J table, add the offset, +// addresses are sinh_AD_mJ and sinh_AD_J, get the T value +// f32 = T(j)_hi +// f33 = T(j)_lo +// f34 = T(-j)_hi +// f35 = T(-j)_lo + +{ .mmi +(p0) sub r34 = r35, r32 +(p0) addl r37 = @ltoff(double_cosh_j_table), gp + nop.i 999 +} +;; + +{ .mfi + ld8 r37 = [r37] +(p0) fma.s1 cosh_FR_Rcub = cosh_FR_Rsq, cosh_FR_R, f0 + nop.i 999 +} + +// ****************************************************** +// STEP 3 Now decide if we need to branch to EXP +// 
****************************************************** +// Put 32 in f9; p6 true if x < 32 + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000010004 ;; +} + +// Calculate p_even +// f34 = B_2 + Rsq *B_3 +// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) +// f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_peven_temp1 = cosh_FR_Rsq, cosh_FR_B3, cosh_FR_B2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_peven_temp2 = cosh_FR_Rsq, cosh_FR_peven_temp1, cosh_FR_B1 + nop.i 999 +} + +// Calculate p_odd +// f34 = A_2 + Rsq *A_3 +// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) +// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_podd_temp1 = cosh_FR_Rsq, cosh_FR_A3, cosh_FR_A2 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp cosh_FR_N_temp1 = r39 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_peven = cosh_FR_Rsq, cosh_FR_peven_temp2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_podd_temp2 = cosh_FR_Rsq, cosh_FR_podd_temp1, cosh_FR_A1 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_podd = cosh_FR_podd_temp2, cosh_FR_Rcub, cosh_FR_R + nop.i 999 +} + +// sinh_GR_mj contains the table offset for -j +// sinh_GR_j contains the table offset for +j +// p6 is true when j <= 0 + +{ .mlx +(p0) setf.exp cosh_FR_N_temp2 = r40 +(p0) movl r40 = 0x0000000000000020 ;; +} + +{ .mfi +(p0) sub GR_mJ = r40, r36 +(p0) fmerge.se cosh_FR_spos = cosh_FR_N_temp1, f1 +(p0) adds GR_J = 0x20, r36 ;; +} + +{ .mii + nop.m 999 +(p0) shl GR_mJ = GR_mJ, 5 ;; +(p0) add AD_mJ = r37, GR_mJ ;; +} + +{ .mmi + nop.m 999 +(p0) ldfe cosh_FR_Tmjhi = [AD_mJ],16 +(p0) shl GR_J = GR_J, 5 ;; +} + +{ .mfi +(p0) ldfs cosh_FR_Tmjlo = [AD_mJ],16 +(p0) fcmp.lt.unc.s1 p6,p7 = cosh_FR_X,f9 +(p0) add AD_J = r37, GR_J ;; +} + +{ .mmi +(p0) ldfe cosh_FR_Tjhi = [AD_J],16 ;; +(p0) ldfs cosh_FR_Tjlo = [AD_J],16 
+ nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.se cosh_FR_sneg = cosh_FR_N_temp2, f1 +(p7) br.cond.spnt L(COSH_BY_EXP) ;; +} + +// ****************************************************** +// If NOT branch to EXP +// ****************************************************** +// Calculate C_hi +// ****************************************************** +// cosh_FR_C_hi_temp = cosh_FR_sneg * cosh_FR_Tmjhi +// cosh_FR_C_hi = cosh_FR_spos * cosh_FR_Tjhi + (cosh_FR_sneg * cosh_FR_Tmjhi) + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_hi_temp = cosh_FR_sneg, cosh_FR_Tmjhi, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi_temp + nop.i 999 +} + +// ****************************************************** +// Calculate S_hi +// ****************************************************** +// cosh_FR_S_hi_temp1 = cosh_FR_sneg * cosh_FR_Tmjhi +// cosh_FR_S_hi = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi_temp1 + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_S_hi_temp1 = cosh_FR_sneg, cosh_FR_Tmjhi, f0 + nop.i 999 ;; +} + +// ****************************************************** +// Calculate C_lo +// ****************************************************** +// cosh_FR_C_lo_temp1 = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi +// cosh_FR_C_lo_temp2 = cosh_FR_sneg * cosh_FR_Tmjlo + (cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi) +// cosh_FR_C_lo_temp1 = cosh_FR_sneg * cosh_FR_Tmjlo +// cosh_FR_C_lo_temp3 = cosh_FR_spos * cosh_FR_Tjlo + (cosh_FR_sneg * cosh_FR_Tmjlo) +// cosh_FR_C_lo = cosh_FR_C_lo_temp3 + cosh_FR_C_lo_temp2 + +{ .mfi + nop.m 999 +(p0) fms.s1 cosh_FR_C_lo_temp1 = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fms.s1 cosh_FR_S_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_S_hi_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_lo_temp2 = cosh_FR_sneg, cosh_FR_Tmjhi, cosh_FR_C_lo_temp1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_lo_temp1 = cosh_FR_sneg, 
cosh_FR_Tmjlo, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_lo_temp3 = cosh_FR_spos, cosh_FR_Tjlo, cosh_FR_C_lo_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_lo = cosh_FR_C_lo_temp3, f1, cosh_FR_C_lo_temp2 + nop.i 999 ;; +} + +// ****************************************************** +// cosh_FR_Y_lo_temp = cosh_FR_C_hi * cosh_FR_peven + cosh_FR_C_lo +// cosh_FR_Y_lo = cosh_FR_S_hi * cosh_FR_podd + cosh_FR_Y_lo_temp +// cosh_FR_COSH = Y_hi + Y_lo + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_C_hi, cosh_FR_peven, cosh_FR_C_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_S_hi, cosh_FR_podd, cosh_FR_Y_lo_temp + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.d.s0 f8 = cosh_FR_C_hi, f1, cosh_FR_Y_lo +(p0) br.ret.sptk b0 ;; +} + +L(COSH_BY_EXP): + +// When p7 is true, we know that an overflow is not going to happen +// When p7 is false, we must check for possible overflow +// p7 is the over_SAFE flag +// f44 = Scale * (Y_hi + Y_lo) +// = cosh_FR_spos * (cosh_FR_Tjhi + cosh_FR_Y_lo) + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_peven, f1, cosh_FR_podd + nop.i 999 +} + +// Now we are in EXP. This is the only path where an overflow is possible +// but not for certain. So this is the only path where over_SAFE has any use. 
+// r34 still has N-1 +// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe +// There is a danger of double overflow if N-1 > 0x3fe = 1022 + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x00000000000003fe ;; +} + +{ .mfi +(p0) cmp.gt.unc p0,p7 = r34, r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_Tjhi, cosh_FR_Y_lo_temp, cosh_FR_Tjlo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_COSH_temp = cosh_FR_Y_lo, f1, cosh_FR_Tjhi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.d.s0 f44 = cosh_FR_spos, cosh_FR_COSH_temp, f0 + nop.i 999 ;; +} + +// If over_SAFE is set, return +{ .mfb + nop.m 999 +(p7) fmerge.s f8 = f44,f44 +(p7) br.ret.sptk b0 ;; +} + +// Else see if we overflowed +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// If WRE is set then an overflow will not occur in EXP. +// The input value that would cause a register (WRE) value to overflow is about 2^15 +// and this input would go into the HUGE path. +// Answer with WRE is in f43. + +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fma.d.s2 f43 = cosh_FR_spos, cosh_FR_COSH_temp, f0 + nop.i 999 ;; +} + +// 103FF => 103FF -FFFF = 400(true) +// 400 + 3FF = 7FF, which is 1 more that the exponent of the largest +// double (7FE). 
So 0 103FF 8000000000000000 is one ulp more than +// largest double in register bias +// Now set p8 if the answer with WRE is greater than or equal this value +// Also set p9 if the answer with WRE is less than or equal to negative this value + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x00000000000103ff ;; +} + +{ .mmf + nop.m 999 +(p0) setf.exp f41 = r32 +(p0) fsetc.s2 0x7F,0x40 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.ns f42 = f41, f41 + nop.i 999 ;; +} + +// The error tag for overflow is 64 +{ .mii + nop.m 999 + nop.i 999 ;; +(p8) mov r47 = 64 ;; +} + +{ .mfb + nop.m 999 +(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 +(p8) br.cond.spnt __libm_error_region ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +(p9) mov r47 = 64 +} + +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt __libm_error_region ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.s f8 = f44,f44 +(p0) br.ret.sptk b0 ;; +} + + +// for COSH_HUGE, put 24000 in exponent; take sign from input; add 1 +// SAFE: SAFE is always 0 for HUGE + +L(COSH_HUGE): + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000015dbf ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_hi_lo = f1, f9, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.d.s0 f44 = f9, cosh_FR_hi_lo, f0 +(p0) mov r47 = 64 +} +;; + +.endp cosh# +ASM_SIZE_DIRECTIVE(cosh#) + +// Stack operations when calling error support. 
+// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 +// | | | | +// | <-GR_Y Y2->| Y2 ->| <- GR_Y | +// | | | | +// | | <- GR_X X1 ->| | +// | | | | +// sp-64 -> + sp -> + sp -> + + +// save ar.pfs save b0 restore gp +// save gp restore ar.pfs + +.proc __libm_error_region +__libm_error_region: +.prologue +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_coshf.S b/sysdeps/ia64/fpu/e_coshf.S new file mode 100644 index 0000000000..84130ae9d4 --- /dev/null +++ b/sysdeps/ia64/fpu/e_coshf.S @@ -0,0 +1,1133 @@ +.file "coshf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. 
+// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 2/16/00 The error tag for coshf overflow changed to 65 (from 64). +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// +// API +//============================================================== +// float = coshf(float) +// input floating point f8 +// output floating point f8 + + +// Overview of operation +//============================================================== +// There are four paths + +// 1. |x| < 0.25 COSH_BY_POLY +// 2. |x| < 32 COSH_BY_TBL +// 3. |x| < 2^7 COSH_BY_EXP +// 4. |x| >= 2^7 COSH_HUGE + +// For paths 1 and 2, SAFE is always 1. +// For path 4, SAFE is always 0. 
+// SAFE = 1 means we cannot overflow. + +#include "libm_support.h" + +// Assembly macros +//============================================================== +coshf_FR_X = f44 +coshf_FR_SGNX = f40 + +coshf_FR_Inv_log2by64 = f9 +coshf_FR_log2by64_lo = f11 +coshf_FR_log2by64_hi = f10 + +coshf_FR_A1 = f9 +coshf_FR_A2 = f10 +coshf_FR_A3 = f11 + +coshf_FR_Rcub = f12 +coshf_FR_M_temp = f13 +coshf_FR_R_temp = f13 +coshf_FR_Rsq = f13 +coshf_FR_R = f14 + +coshf_FR_M = f38 + +coshf_FR_B1 = f15 +coshf_FR_B2 = f32 +coshf_FR_B3 = f33 + +coshf_FR_peven_temp1 = f34 +coshf_FR_peven_temp2 = f35 +coshf_FR_peven = f36 + +coshf_FR_podd_temp1 = f34 +coshf_FR_podd_temp2 = f35 +coshf_FR_podd = f37 + +coshf_FR_J_temp = f9 +coshf_FR_J = f10 + +coshf_FR_Mmj = f39 + +coshf_FR_N_temp1 = f11 +coshf_FR_N_temp2 = f12 +coshf_FR_N = f13 + +coshf_FR_spos = f14 +coshf_FR_sneg = f15 + +coshf_FR_Tjhi = f32 +coshf_FR_Tjlo = f33 +coshf_FR_Tmjhi = f34 +coshf_FR_Tmjlo = f35 + +GR_mJ = r35 +GR_J = r36 + +AD_mJ = r38 +AD_J = r39 + + +GR_SAVE_B0 = r42 +GR_SAVE_PFS = r41 +GR_SAVE_GP = r43 + +GR_Parameter_X = r44 +GR_Parameter_Y = r45 +GR_Parameter_RESULT = r46 +GR_Parameter_TAG = r47 + +FR_X = f8 +FR_Y = f0 +FR_RESULT = f44 + + +coshf_FR_C_hi = f9 +coshf_FR_C_hi_temp = f10 +coshf_FR_C_lo_temp1 = f11 +coshf_FR_C_lo_temp2 = f12 +coshf_FR_C_lo_temp3 = f13 + +coshf_FR_C_lo = f38 +coshf_FR_S_hi = f39 + +coshf_FR_S_hi_temp1 = f10 +coshf_FR_Y_hi = f11 +coshf_FR_Y_lo_temp = f12 +coshf_FR_Y_lo = f13 +coshf_FR_COSH = f9 + +coshf_FR_X2 = f9 +coshf_FR_X4 = f10 + +coshf_FR_P1 = f14 +coshf_FR_P2 = f15 +coshf_FR_P3 = f32 +coshf_FR_P4 = f33 +coshf_FR_P5 = f34 +coshf_FR_P6 = f35 + +coshf_FR_TINY_THRESH = f9 + +coshf_FR_COSH_temp = f10 +coshf_FR_SCALE = f11 + +coshf_FR_hi_lo = f10 + +coshf_FR_poly_podd_temp1 = f11 +coshf_FR_poly_podd_temp2 = f13 +coshf_FR_poly_peven_temp1 = f11 +coshf_FR_poly_peven_temp2 = f13 + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else 
+.data +#endif + +.align 16 +single_coshf_arg_reduction: +ASM_TYPE_DIRECTIVE(single_coshf_arg_reduction,@object) + data8 0xB8AA3B295C17F0BC, 0x00004005 + data8 0xB17217F7D1000000, 0x00003FF8 + data8 0xCF79ABC9E3B39804, 0x00003FD0 +ASM_SIZE_DIRECTIVE(single_coshf_arg_reduction) + +single_coshf_p_table: +ASM_TYPE_DIRECTIVE(single_coshf_p_table,@object) + data8 0x8000000000000000, 0x00003FFE + data8 0xAAAAAAAAAAAAAB80, 0x00003FFA + data8 0xB60B60B60B4FE884, 0x00003FF5 + data8 0xD00D00D1021D7370, 0x00003FEF + data8 0x93F27740C0C2F1CC, 0x00003FE9 + data8 0x8FA02AC65BCBD5BC, 0x00003FE2 +ASM_SIZE_DIRECTIVE(single_coshf_p_table) + +single_coshf_ab_table: +ASM_TYPE_DIRECTIVE(single_coshf_ab_table,@object) + data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC + data8 0x88888888884ECDD5, 0x00003FF8 + data8 0xD00D0C6DCC26A86B, 0x00003FF2 + data8 0x8000000000000002, 0x00003FFE + data8 0xAAAAAAAAAA402C77, 0x00003FFA + data8 0xB60B6CC96BDB144D, 0x00003FF5 +ASM_SIZE_DIRECTIVE(single_coshf_ab_table) + +single_coshf_j_table: +ASM_TYPE_DIRECTIVE(single_coshf_j_table,@object) + data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 + data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 + data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 + data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 + data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 + data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 + data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 + data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 + data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 + data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 + data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 + data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 + data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 + data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 + data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 
0x00000000 + data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 + data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 + data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 + data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 + data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 + data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 + data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 + data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 + data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 + data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 + data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 + data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 + data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 + data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 + data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 + data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 + data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 + data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 + data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 + data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 + data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 + data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 + data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 + data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 + data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 + data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 + data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 + data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 + data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 + data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 + data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 + data8 0x94F4EFA8FEF70961, 
0x00003FFF, 0x1EBA2BEB, 0x00000000 + data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 + data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 + data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 + data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 + data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 + data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 + data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 + data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 + data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 + data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 + data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 + data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 + data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 + data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 + data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 + data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 + data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 + data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 +ASM_SIZE_DIRECTIVE(single_coshf_j_table) + +.align 32 +.global coshf# + +.section .text +.proc coshf# +.align 32 + +coshf: + +#ifdef _LIBC +.global __ieee754_coshf# +.proc __ieee754_coshf# +__ieee754_coshf: +#endif + +// X NAN? 
+ + +{ .mfi + alloc r32 = ar.pfs,0,12,4,0 +(p0) fclass.m.unc p6,p7 = f8, 0xc3 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p6) fma.s.s0 f8 = f8,f1,f8 +(p6) br.ret.spnt b0 ;; +} + +{ .mfi + nop.m 999 + nop.f 999 + nop.i 999 ;; +} + +// X infinity +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6,p0 = f8, 0x23 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p6) fmerge.s f8 = f0,f8 +(p6) br.ret.spnt b0 ;; +} + +// Put 0.25 in f9; p6 true if x < 0.25 +{ .mlx + nop.m 999 +(p0) movl r32 = 0x000000000000fffd ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s coshf_FR_X = f0,f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.s coshf_FR_SGNX = f8,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc p0,p7 = coshf_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.sptk L(COSH_BY_TBL) ;; +} + + +// COSH_BY_POLY: + +// POLY cannot overflow so there is no need to call __libm_error_support +// Get the values of P_x from the table + +{ .mmi + nop.m 999 +(p0) addl r34 = @ltoff(single_coshf_p_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + +// Calculate coshf_FR_X2 = ax*ax and coshf_FR_X4 = ax*ax*ax*ax +{ .mmf + nop.m 999 +(p0) ldfe coshf_FR_P1 = [r34],16 +(p0) fma.s1 coshf_FR_X2 = coshf_FR_X, coshf_FR_X, f0 ;; +} + +{ .mmi +(p0) ldfe coshf_FR_P2 = [r34],16 ;; +(p0) ldfe coshf_FR_P3 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe coshf_FR_P4 = [r34],16 ;; +(p0) ldfe coshf_FR_P5 = [r34],16 + nop.i 999 ;; +} + +{ .mfi +(p0) ldfe coshf_FR_P6 = [r34],16 +(p0) fma.s1 coshf_FR_X4 = coshf_FR_X2, coshf_FR_X2, f0 + nop.i 999 ;; +} + +// Calculate coshf_FR_podd = x4 *(x4 * P_5 + P_3) + P_1 +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_poly_podd_temp1 = coshf_FR_X4, coshf_FR_P5, coshf_FR_P3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_podd = coshf_FR_X4, coshf_FR_poly_podd_temp1, coshf_FR_P1 + nop.i 999 +} + +// Calculate coshf_FR_peven = p_even = x4 *(x4 * (x4 * P_6 
+ P_4) + P_2) +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_poly_peven_temp1 = coshf_FR_X4, coshf_FR_P6, coshf_FR_P4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_poly_peven_temp2 = coshf_FR_X4, coshf_FR_poly_peven_temp1, coshf_FR_P2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_peven = coshf_FR_X4, coshf_FR_poly_peven_temp2, f0 + nop.i 999 ;; +} + +// Y_lo = x2*p_odd + p_even +// Calculate f8 = Y_hi + Y_lo + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_Y_lo = coshf_FR_X2, coshf_FR_podd, coshf_FR_peven + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.s.s0 f8 = f1, f1, coshf_FR_Y_lo +(p0) br.ret.sptk b0 ;; +} + + +L(COSH_BY_TBL): + +// Now that we are at TBL; so far all we know is that |x| >= 0.25. +// The first two steps are the same for TBL and EXP, but if we are HUGE +// Double +// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) +// Single +// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) +// we want to leave now. Go to HUGE if |x| >= 2^14 +// 1000d (register-biased) is e = 14 (true) + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000010006 ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc p6,p7 = coshf_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(COSH_HUGE) ;; +} + +// r32 = 1 +// r34 = N-1 +// r35 = N +// r36 = j +// r37 = N+1 + +// TBL can never overflow +// coshf(x) = coshf(B+R) +// = coshf(B) coshf(R) + sinh(B) sinh(R) +// coshf(R) can be approximated by 1 + p_even +// sinh(R) can be approximated by p_odd + +// ****************************************************** +// STEP 1 (TBL and EXP) +// ****************************************************** +// Get the following constants. +// f9 = Inv_log2by64 +// f10 = log2by64_hi +// f11 = log2by64_lo + +{ .mmi +(p0) adds r32 = 0x1,r0 +(p0) addl r34 = @ltoff(single_coshf_arg_reduction), gp + nop.i 999 +} +;; + + +// We want 2^(N-1) and 2^(-N-1). 
So bias N-1 and -N-1 and +// put them in an exponent. +// coshf_FR_spos = 2^(N-1) and coshf_FR_sneg = 2^(-N-1) +// r39 = 0xffff + (N-1) = 0xffff +N -1 +// r40 = 0xffff - (N +1) = 0xffff -N -1 + +{ .mlx + ld8 r34 = [r34] +(p0) movl r38 = 0x000000000000fffe ;; +} + +{ .mmi +(p0) ldfe coshf_FR_Inv_log2by64 = [r34],16 ;; +(p0) ldfe coshf_FR_log2by64_hi = [r34],16 + nop.i 999 ;; +} + +{ .mbb +(p0) ldfe coshf_FR_log2by64_lo = [r34],16 + nop.b 999 + nop.b 999 ;; +} + +// Get the A coefficients +// f9 = A_1 +// f10 = A_2 +// f11 = A_3 + +{ .mmi + nop.m 999 +(p0) addl r34 = @ltoff(single_coshf_ab_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// Calculate M and keep it as integer and floating point. +// M = round-to-integer(x*Inv_log2by64) +// coshf_FR_M = M = truncate(ax/(log2/64)) +// Put the significand of M in r35 +// and the floating point representation of M in coshf_FR_M + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_M = coshf_FR_X, coshf_FR_Inv_log2by64, f0 + nop.i 999 +} + +{ .mfi +(p0) ldfe coshf_FR_A1 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcvt.fx.s1 coshf_FR_M_temp = coshf_FR_M + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnorm.s1 coshf_FR_M = coshf_FR_M_temp + nop.i 999 ;; +} + +{ .mfi +(p0) getf.sig r35 = coshf_FR_M_temp + nop.f 999 + nop.i 999 ;; +} + +// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It +// has a range of -32 thru 31. 
+// r35 = M +// r36 = j + +{ .mii + nop.m 999 + nop.i 999 ;; +(p0) and r36 = 0x3f, r35 ;; +} + +// Calculate R +// f13 = f44 - f12*f10 = x - M*log2by64_hi +// f14 = f13 - f8*f11 = R = (x - M*log2by64_hi) - M*log2by64_lo + +{ .mfi + nop.m 999 +(p0) fnma.s1 coshf_FR_R_temp = coshf_FR_M, coshf_FR_log2by64_hi, coshf_FR_X + nop.i 999 +} + +{ .mfi +(p0) ldfe coshf_FR_A2 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 coshf_FR_R = coshf_FR_M, coshf_FR_log2by64_lo, coshf_FR_R_temp + nop.i 999 +} + +// Get the B coefficients +// f15 = B_1 +// f32 = B_2 +// f33 = B_3 + +{ .mmi +(p0) ldfe coshf_FR_A3 = [r34],16 ;; +(p0) ldfe coshf_FR_B1 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe coshf_FR_B2 = [r34],16 ;; +(p0) ldfe coshf_FR_B3 = [r34],16 + nop.i 999 ;; +} + +{ .mii + nop.m 999 +(p0) shl r34 = r36, 0x2 ;; +(p0) sxt1 r37 = r34 ;; +} + +// ****************************************************** +// STEP 2 (TBL and EXP) +// ****************************************************** +// Calculate Rsquared and Rcubed in preparation for p_even and p_odd +// f12 = R*R*R +// f13 = R*R +// f14 = R <== from above + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_Rsq = coshf_FR_R, coshf_FR_R, f0 +(p0) shr r36 = r37, 0x2 ;; +} + +// r34 = M-j = r35 - r36 +// r35 = N = (M-j)/64 + +{ .mii +(p0) sub r34 = r35, r36 + nop.i 999 ;; +(p0) shr r35 = r34, 0x6 ;; +} + +{ .mii +(p0) sub r40 = r38, r35 +(p0) adds r37 = 0x1, r35 +(p0) add r39 = r38, r35 ;; +} + +// Get the address of the J table, add the offset, +// addresses are sinh_AD_mJ and sinh_AD_J, get the T value +// f32 = T(j)_hi +// f33 = T(j)_lo +// f34 = T(-j)_hi +// f35 = T(-j)_lo + +{ .mmi +(p0) sub r34 = r35, r32 +(p0) addl r37 = @ltoff(single_coshf_j_table), gp + nop.i 999 +} +;; + +{ .mfi + ld8 r37 = [r37] +(p0) fma.s1 coshf_FR_Rcub = coshf_FR_Rsq, coshf_FR_R, f0 + nop.i 999 +} + +// ****************************************************** +// STEP 3 Now decide if we need to branch to EXP +// 
****************************************************** +// Put 32 in f9; p6 true if x < 32 + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000010004 ;; +} + +// Calculate p_even +// f34 = B_2 + Rsq *B_3 +// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) +// f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_peven_temp1 = coshf_FR_Rsq, coshf_FR_B3, coshf_FR_B2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_peven_temp2 = coshf_FR_Rsq, coshf_FR_peven_temp1, coshf_FR_B1 + nop.i 999 +} + +// Calculate p_odd +// f34 = A_2 + Rsq *A_3 +// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) +// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_podd_temp1 = coshf_FR_Rsq, coshf_FR_A3, coshf_FR_A2 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp coshf_FR_N_temp1 = r39 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_peven = coshf_FR_Rsq, coshf_FR_peven_temp2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_podd_temp2 = coshf_FR_Rsq, coshf_FR_podd_temp1, coshf_FR_A1 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_podd = coshf_FR_podd_temp2, coshf_FR_Rcub, coshf_FR_R + nop.i 999 +} + +// sinh_GR_mj contains the table offset for -j +// sinh_GR_j contains the table offset for +j +// p6 is true when j <= 0 + +{ .mlx +(p0) setf.exp coshf_FR_N_temp2 = r40 +(p0) movl r40 = 0x0000000000000020 ;; +} + +{ .mfi +(p0) sub GR_mJ = r40, r36 +(p0) fmerge.se coshf_FR_spos = coshf_FR_N_temp1, f1 +(p0) adds GR_J = 0x20, r36 ;; +} + +{ .mii + nop.m 999 +(p0) shl GR_mJ = GR_mJ, 5 ;; +(p0) add AD_mJ = r37, GR_mJ ;; +} + +{ .mmi + nop.m 999 +(p0) ldfe coshf_FR_Tmjhi = [AD_mJ],16 +(p0) shl GR_J = GR_J, 5 ;; +} + +{ .mfi +(p0) ldfs coshf_FR_Tmjlo = [AD_mJ],16 +(p0) fcmp.lt.unc.s1 p6,p7 = coshf_FR_X,f9 +(p0) add AD_J = r37, GR_J ;; +} + +{ .mmi +(p0) ldfe coshf_FR_Tjhi = [AD_J],16 ;; 
+(p0) ldfs coshf_FR_Tjlo = [AD_J],16 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.se coshf_FR_sneg = coshf_FR_N_temp2, f1 +(p7) br.cond.spnt L(COSH_BY_EXP) ;; +} + +// ****************************************************** +// If NOT branch to EXP +// ****************************************************** +// Calculate C_hi +// ****************************************************** +// coshf_FR_C_hi_temp = coshf_FR_sneg * coshf_FR_Tmjhi +// coshf_FR_C_hi = coshf_FR_spos * coshf_FR_Tjhi + (coshf_FR_sneg * coshf_FR_Tmjhi) + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_C_hi_temp = coshf_FR_sneg, coshf_FR_Tmjhi, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_C_hi = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_C_hi_temp + nop.i 999 +} + +// ****************************************************** +// Calculate S_hi +// ****************************************************** +// coshf_FR_S_hi_temp1 = coshf_FR_sneg * coshf_FR_Tmjhi +// coshf_FR_S_hi = coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi_temp1 + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_S_hi_temp1 = coshf_FR_sneg, coshf_FR_Tmjhi, f0 + nop.i 999 ;; +} + +// ****************************************************** +// Calculate C_lo +// ****************************************************** +// coshf_FR_C_lo_temp1 = coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi +// coshf_FR_C_lo_temp2 = coshf_FR_sneg * coshf_FR_Tmjlo + (coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi) +// coshf_FR_C_lo_temp1 = coshf_FR_sneg * coshf_FR_Tmjlo +// coshf_FR_C_lo_temp3 = coshf_FR_spos * coshf_FR_Tjlo + (coshf_FR_sneg * coshf_FR_Tmjlo) +// coshf_FR_C_lo = coshf_FR_C_lo_temp3 + coshf_FR_C_lo_temp2 + +{ .mfi + nop.m 999 +(p0) fms.s1 coshf_FR_C_lo_temp1 = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_C_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fms.s1 coshf_FR_S_hi = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_S_hi_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_C_lo_temp2 = coshf_FR_sneg, coshf_FR_Tmjhi, 
coshf_FR_C_lo_temp1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_C_lo_temp1 = coshf_FR_sneg, coshf_FR_Tmjlo, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_C_lo_temp3 = coshf_FR_spos, coshf_FR_Tjlo, coshf_FR_C_lo_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_C_lo = coshf_FR_C_lo_temp3, f1, coshf_FR_C_lo_temp2 + nop.i 999 ;; +} + +// ****************************************************** +// coshf_FR_Y_lo_temp = coshf_FR_C_hi * coshf_FR_peven + coshf_FR_C_lo +// coshf_FR_Y_lo = coshf_FR_S_hi * coshf_FR_podd + coshf_FR_Y_lo_temp +// coshf_FR_COSH = Y_hi + Y_lo + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_Y_lo_temp = coshf_FR_C_hi, coshf_FR_peven, coshf_FR_C_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_Y_lo = coshf_FR_S_hi, coshf_FR_podd, coshf_FR_Y_lo_temp + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.s.s0 f8 = coshf_FR_C_hi, f1, coshf_FR_Y_lo +(p0) br.ret.sptk b0 ;; +} + + +L(COSH_BY_EXP): + +// When p7 is true, we know that an overflow is not going to happen +// When p7 is false, we must check for possible overflow +// p7 is the over_SAFE flag +// f44 = Scale * (Y_hi + Y_lo) +// = coshf_FR_spos * (coshf_FR_Tjhi + coshf_FR_Y_lo) + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_Y_lo_temp = coshf_FR_peven, f1, coshf_FR_podd + nop.i 999 +} + +// Now we are in EXP. This is the only path where an overflow is possible +// but not for certain. So this is the only path where over_SAFE has any use. 
+// r34 still has N-1 +// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe +// There is a danger of double overflow if N-1 > 0x3fe = 1022 +// There is a danger of single overflow if N-1 > 0x7e = 126 + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x000000000000007e ;; +} + +{ .mfi +(p0) cmp.gt.unc p0,p7 = r34, r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_Y_lo = coshf_FR_Tjhi, coshf_FR_Y_lo_temp, coshf_FR_Tjlo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_COSH_temp = coshf_FR_Y_lo, f1, coshf_FR_Tjhi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s.s0 f44 = coshf_FR_spos, coshf_FR_COSH_temp, f0 + nop.i 999 ;; +} + +// If over_SAFE is set, return +{ .mfb + nop.m 999 +(p7) fmerge.s f8 = f44,f44 +(p7) br.ret.sptk b0 ;; +} + +// Else see if we overflowed +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// If WRE is set then an overflow will not occur in EXP. +// The input value that would cause a register (WRE) value to overflow is about 2^15 +// and this input would go into the HUGE path. +// Answer with WRE is in f43. 
+ +// Recompute the scaled result into f43 under status field s2, which is +// switched to the WRE + TD setting described above for this one operation. +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fma.s.s2 f43 = coshf_FR_spos, coshf_FR_COSH_temp, f0 + nop.i 999 ;; +} + +// 1 more than the exponent of the largest double (7FE) = 7FF +// 7FF - 3FF = 400 (true); 400 + FFFF = 103FF (register-biased) +// So 0 103FF 8000000000000000 is one ulp more than +// largest double in register bias +// 1 more than the exponent of the largest single (FE) = FF +// FF - 7F = 80 (true); 80 + FFFF = 1007F (register-biased) +// Now set p8 if the answer with WRE is greater than or equal to this value +// Also set p9 if the answer with WRE is less than or equal to negative this value + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x000000000001007f ;; +} + +{ .mmf + nop.m 999 +(p0) setf.exp f41 = r32 +(p0) fsetc.s2 0x7F,0x40 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.ns f42 = f41, f41 + nop.i 999 ;; +} + +// The error tag for overflow is 65 +{ .mii + nop.m 999 + nop.i 999 ;; +(p8) mov GR_Parameter_TAG = 65 ;; +} + +{ .mfb + nop.m 999 +(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 +(p8) br.cond.spnt __libm_error_region ;; +} + +// The p9 (result <= -threshold) case reports the same coshf overflow tag as p8. +// It previously loaded 64, the tag the double-precision cosh routine uses, +// while __libm_error_region in this file stores its operands with stfs. +{ .mii + nop.m 999 + nop.i 999 ;; +(p9) mov GR_Parameter_TAG = 65 +} + +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt __libm_error_region ;; +} + +// Neither threshold was reached: return the s0 result computed earlier in f44. +{ .mfb + nop.m 999 +(p0) fmerge.s f8 = f44,f44 +(p0) br.ret.sptk b0 ;; +} + + +L(COSH_HUGE): + +// for COSH_HUGE, put 24000 in the exponent of f9 (0x15dbf - 0xffff = 24000), +// form f9 + 1, and multiply by f9 with single-precision rounding into f44. +// No branch follows: execution falls through into __libm_error_region +// with the coshf overflow tag (65). +// SAFE: SAFE is always 0 for HUGE + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000015dbf ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 coshf_FR_hi_lo = f1, f9, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s.s0 f44 = f9, coshf_FR_hi_lo, f0 +(p0) mov GR_Parameter_TAG = 65 +} +.endp coshf +ASM_SIZE_DIRECTIVE(coshf) + + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + 
nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk.many b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_coshl.S b/sysdeps/ia64/fpu/e_coshl.S new file mode 100644 index 0000000000..97486f6d1d --- /dev/null +++ b/sysdeps/ia64/fpu/e_coshl.S @@ -0,0 +1,1150 @@ +.file "coshl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 1/23/01 Set inexact flag for large args. +// +// API +//============================================================== +// float = cosh(float) +// double = cosh(double) +// long double = coshl(long double) +// input floating point f8 +// output floating point f8 + + +// Overview of operation +//============================================================== +// There are four paths + +// 1. |x| < 0.25 COSH_BY_POLY +// 2. |x| < 32 COSH_BY_TBL +// 3. |x| < 2^14 COSH_BY_EXP +// 4. |x| >= 2^14 COSH_HUGE + +// For paths 1, and 2 SAFE is always 1. +// For path 4, Safe is always 0. +// SAFE = 1 means we cannot overflow. 
+ +#include "libm_support.h" + +// Assembly macros +//============================================================== +cosh_FR_X = f44 +FR_RESULT = f44 +cosh_FR_SGNX = f40 +cosh_FR_all_ones = f45 + +FR_X = f8 +FR_Y = f0 +cosh_FR_Inv_log2by64 = f9 +cosh_FR_log2by64_lo = f11 +cosh_FR_log2by64_hi = f10 + +cosh_FR_A1 = f9 +cosh_FR_A2 = f10 +cosh_FR_A3 = f11 + +cosh_FR_Rcub = f12 +cosh_FR_M_temp = f13 +cosh_FR_R_temp = f13 +cosh_FR_Rsq = f13 +cosh_FR_R = f14 + +cosh_FR_M = f38 + +cosh_FR_tmp = f15 +cosh_FR_B1 = f15 +cosh_FR_B2 = f32 +cosh_FR_B3 = f33 + +cosh_FR_peven_temp1 = f34 +cosh_FR_peven_temp2 = f35 +cosh_FR_peven = f36 + +cosh_FR_podd_temp1 = f34 +cosh_FR_podd_temp2 = f35 +cosh_FR_podd = f37 + +cosh_FR_J_temp = f9 +cosh_FR_J = f10 + +cosh_FR_Mmj = f39 + +cosh_FR_N_temp1 = f11 +cosh_FR_N_temp2 = f12 +cosh_FR_N = f13 + +cosh_FR_spos = f14 +cosh_FR_sneg = f15 + +cosh_FR_Tjhi = f32 +cosh_FR_Tjlo = f33 +cosh_FR_Tmjhi = f34 +cosh_FR_Tmjlo = f35 + +GR_mJ = r35 +GR_J = r36 + +AD_mJ = r38 +AD_J = r39 + +cosh_GR_all_ones = r40 + +GR_SAVE_PFS = r41 +GR_SAVE_B0 = r42 +GR_SAVE_GP = r43 +GR_Parameter_X = r44 +GR_Parameter_Y = r45 +GR_Parameter_RESULT = r46 +GR_Parameter_TAG = r47 + +cosh_FR_C_hi = f9 +cosh_FR_C_hi_temp = f10 +cosh_FR_C_lo_temp1 = f11 +cosh_FR_C_lo_temp2 = f12 +cosh_FR_C_lo_temp3 = f13 + +cosh_FR_C_lo = f38 +cosh_FR_S_hi = f39 + +cosh_FR_S_hi_temp1 = f10 +cosh_FR_Y_hi = f11 +cosh_FR_Y_lo_temp = f12 +cosh_FR_Y_lo = f13 +cosh_FR_COSH = f9 + +cosh_FR_X2 = f9 +cosh_FR_X4 = f10 + +cosh_FR_P1 = f14 +cosh_FR_P2 = f15 +cosh_FR_P3 = f32 +cosh_FR_P4 = f33 +cosh_FR_P5 = f34 +cosh_FR_P6 = f35 + +cosh_FR_TINY_THRESH = f9 + +cosh_FR_COSH_temp = f10 +cosh_FR_SCALE = f11 + +cosh_FR_hi_lo = f10 + +cosh_FR_poly_podd_temp1 = f11 +cosh_FR_poly_podd_temp2 = f13 +cosh_FR_poly_peven_temp1 = f11 +cosh_FR_poly_peven_temp2 = f13 + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 
+double_cosh_arg_reduction: +ASM_TYPE_DIRECTIVE(double_cosh_arg_reduction,@object) + data8 0xB8AA3B295C17F0BC, 0x00004005 + data8 0xB17217F7D1000000, 0x00003FF8 + data8 0xCF79ABC9E3B39804, 0x00003FD0 +ASM_SIZE_DIRECTIVE(double_cosh_arg_reduction) + +double_cosh_p_table: +ASM_TYPE_DIRECTIVE(double_cosh_p_table,@object) + data8 0x8000000000000000, 0x00003FFE + data8 0xAAAAAAAAAAAAAB80, 0x00003FFA + data8 0xB60B60B60B4FE884, 0x00003FF5 + data8 0xD00D00D1021D7370, 0x00003FEF + data8 0x93F27740C0C2F1CC, 0x00003FE9 + data8 0x8FA02AC65BCBD5BC, 0x00003FE2 +ASM_SIZE_DIRECTIVE(double_cosh_p_table) + +double_cosh_ab_table: +ASM_TYPE_DIRECTIVE(double_cosh_ab_table,@object) + data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC + data8 0x88888888884ECDD5, 0x00003FF8 + data8 0xD00D0C6DCC26A86B, 0x00003FF2 + data8 0x8000000000000002, 0x00003FFE + data8 0xAAAAAAAAAA402C77, 0x00003FFA + data8 0xB60B6CC96BDB144D, 0x00003FF5 +ASM_SIZE_DIRECTIVE(double_cosh_ab_table) + +double_cosh_j_table: +ASM_TYPE_DIRECTIVE(double_cosh_j_table,@object) + data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 + data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 + data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 + data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 + data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 + data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 + data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 + data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 + data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 + data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 + data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 + data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 + data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 + data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 + data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000 + data8 0xD4F35AABCFEDFA1F, 
0x00003FFE, 0x1EB243BE, 0x00000000 + data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 + data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 + data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 + data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 + data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 + data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 + data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 + data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 + data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 + data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 + data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 + data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 + data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 + data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 + data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 + data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 + data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 + data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 + data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 + data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 + data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 + data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 + data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 + data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 + data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 + data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 + data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 + data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 + data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 + data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 + data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000 + data8 
0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 + data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 + data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 + data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 + data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 + data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 + data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 + data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 + data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 + data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 + data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 + data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 + data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 + data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 + data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 + data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 + data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 + data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 +ASM_SIZE_DIRECTIVE(double_cosh_j_table) + +.align 32 +.global coshl# + +.section .text +.proc coshl# +.align 32 + +coshl: + +#ifdef _LIBC +.global __ieee754_coshl# +.proc __ieee754_coshl# +__ieee754_coshl: +#endif + +// X NAN? + +{ .mfi + alloc r32 = ar.pfs,0,12,4,0 +(p0) fclass.m.unc p6,p7 = f8, 0xc3 + mov cosh_GR_all_ones = -1 +};; + +// This is more than we need but it is in preparation +// for the values we add for error support. 
We push three +// addresses on the stack (3*8) = 24 bytes and one tag + +{ .mfb + nop.m 999 +(p6) fma.s0 f8 = f8,f1,f8 +(p6) br.ret.spnt b0 ;; +} + + +// Make constant that will generate inexact when squared +// X infinity +{ .mfi + setf.sig cosh_FR_all_ones = cosh_GR_all_ones +(p0) fclass.m.unc p6,p0 = f8, 0x23 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p6) fmerge.s f8 = f0,f8 +(p6) br.ret.spnt b0 ;; +} + + + +// Put 0.25 in f9 (biased exponent 0xfffd = 2^-2); cosh_FR_X holds |x| +// p7 true if |x| >= 0.25, in which case we branch to COSH_BY_TBL +{ .mlx + nop.m 999 +(p0) movl r32 = 0x000000000000fffd ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s cosh_FR_X = f0,f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.s cosh_FR_SGNX = f8,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc p0,p7 = cosh_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.sptk L(COSH_BY_TBL) +} +;; + + +// COSH_BY_POLY: +// POLY cannot overflow so there is no need to call __libm_error_support +// Get the values of P_x from the table + +{ .mmi + nop.m 999 +(p0) addl r34 = @ltoff(double_cosh_p_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// Calculate cosh_FR_X2 = ax*ax and cosh_FR_X4 = ax*ax*ax*ax +{ .mmf + nop.m 999 +(p0) ldfe cosh_FR_P1 = [r34],16 +(p0) fma.s1 cosh_FR_X2 = cosh_FR_X, cosh_FR_X, f0 ;; +} + +{ .mmi +(p0) ldfe cosh_FR_P2 = [r34],16 ;; +(p0) ldfe cosh_FR_P3 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe cosh_FR_P4 = [r34],16 ;; +(p0) ldfe cosh_FR_P5 = [r34],16 + nop.i 999 ;; +} + +{ .mfi +(p0) ldfe cosh_FR_P6 = [r34],16 +(p0) fma.s1 cosh_FR_X4 = cosh_FR_X2, cosh_FR_X2, f0 + nop.i 999 ;; +} + +// Calculate cosh_FR_podd = x4 *(x4 * P_5 + P_3) + P_1 +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_poly_podd_temp1 = cosh_FR_X4, cosh_FR_P5, cosh_FR_P3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_podd = cosh_FR_X4, cosh_FR_poly_podd_temp1, cosh_FR_P1 + nop.i 999 +} + +// Calculate cosh_FR_peven = p_even = x4 *(x4 * (x4 * P_6 + P_4)
+ P_2) +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_poly_peven_temp1 = cosh_FR_X4, cosh_FR_P6, cosh_FR_P4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_poly_peven_temp2 = cosh_FR_X4, cosh_FR_poly_peven_temp1, cosh_FR_P2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_peven = cosh_FR_X4, cosh_FR_poly_peven_temp2, f0 + nop.i 999 ;; +} + +// Y_lo = x2*p_odd + p_even +// Calculate f8 = Y_hi + Y_lo +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_X2, cosh_FR_podd, cosh_FR_peven + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.s0 f8 = f1, f1, cosh_FR_Y_lo +(p0) br.ret.sptk b0 ;; +} + + +L(COSH_BY_TBL): + +// Now that we are at TBL; so far all we know is that |x| >= 0.25. +// The first two steps are the same for TBL and EXP, but if we are HUGE +// Double Extended +// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true) +// Double +// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) +// Single +// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) +// we want to leave now. Go to HUGE if |x| >= 2^14 +// 1000d (register-biased) is e = 14 (true) + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x000000000001000d ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc p6,p7 = cosh_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(COSH_HUGE) ;; +} + +// r32 = 1 +// r34 = N-1 +// r35 = N +// r36 = j +// r37 = N+1 + +// TBL can never overflow +// cosh(x) = cosh(B+R) +// = cosh(B) cosh(R) + sinh(B) sinh(R) +// cosh(R) can be approximated by 1 + p_even +// sinh(R) can be approximated by p_odd + +// ****************************************************** +// STEP 1 (TBL and EXP) +// ****************************************************** +// Get the following constants. 
+// f9 = Inv_log2by64 +// f10 = log2by64_hi +// f11 = log2by64_lo + +{ .mmi +(p0) adds r32 = 0x1,r0 +(p0) addl r34 = @ltoff(double_cosh_arg_reduction), gp + nop.i 999 +} +;; + +// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and +// put them in an exponent. +// cosh_FR_spos = 2^(N-1) and cosh_FR_sneg = 2^(-N-1) +// r39 = 0xffff + (N-1) = 0xffff +N -1 +// r40 = 0xffff - (N +1) = 0xffff -N -1 + +{ .mlx + ld8 r34 = [r34] +(p0) movl r38 = 0x000000000000fffe ;; +} + +{ .mmi +(p0) ldfe cosh_FR_Inv_log2by64 = [r34],16 ;; +(p0) ldfe cosh_FR_log2by64_hi = [r34],16 + nop.i 999 ;; +} + +{ .mbb +(p0) ldfe cosh_FR_log2by64_lo = [r34],16 + nop.b 999 + nop.b 999 ;; +} + +// Get the A coefficients +// f9 = A_1 +// f10 = A_2 +// f11 = A_3 + +{ .mmi + nop.m 999 +(p0) addl r34 = @ltoff(double_cosh_ab_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// Calculate M and keep it as integer and floating point. +// M = round-to-integer(x*Inv_log2by64) +// cosh_FR_M = M = truncate(ax/(log2/64)) +// Put the significand of M in r35 +// and the floating point representation of M in cosh_FR_M + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_M = cosh_FR_X, cosh_FR_Inv_log2by64, f0 + nop.i 999 +} + +{ .mfi +(p0) ldfe cosh_FR_A1 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcvt.fx.s1 cosh_FR_M_temp = cosh_FR_M + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnorm.s1 cosh_FR_M = cosh_FR_M_temp + nop.i 999 ;; +} + +{ .mfi +(p0) getf.sig r35 = cosh_FR_M_temp + nop.f 999 + nop.i 999 ;; +} + +// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It +// has a range of -32 thru 31. 
+// r35 = M +// r36 = j +{ .mii + nop.m 999 + nop.i 999 ;; +(p0) and r36 = 0x3f, r35 ;; +} + +// Calculate R +// f13 = f44 - f38*f10 = x - M*log2by64_hi +// f14 = f13 - f38*f11 = R = (x - M*log2by64_hi) - M*log2by64_lo + +{ .mfi + nop.m 999 +(p0) fnma.s1 cosh_FR_R_temp = cosh_FR_M, cosh_FR_log2by64_hi, cosh_FR_X + nop.i 999 +} + +{ .mfi +(p0) ldfe cosh_FR_A2 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 cosh_FR_R = cosh_FR_M, cosh_FR_log2by64_lo, cosh_FR_R_temp + nop.i 999 +} + +// Get the B coefficients +// f15 = B_1 +// f32 = B_2 +// f33 = B_3 + +{ .mmi +(p0) ldfe cosh_FR_A3 = [r34],16 ;; +(p0) ldfe cosh_FR_B1 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe cosh_FR_B2 = [r34],16 ;; +(p0) ldfe cosh_FR_B3 = [r34],16 + nop.i 999 ;; +} + +{ .mii + nop.m 999 +(p0) shl r34 = r36, 0x2 ;; +(p0) sxt1 r37 = r34 ;; +} + +// ****************************************************** +// STEP 2 (TBL and EXP) +// ****************************************************** +// Calculate Rsquared and Rcubed in preparation for p_even and p_odd +// f12 = R*R*R +// f13 = R*R +// f14 = R <== from above + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Rsq = cosh_FR_R, cosh_FR_R, f0 +(p0) shr r36 = r37, 0x2 ;; +} + +// r34 = M-j = r35 - r36 +// r35 = N = (M-j)/64 + +{ .mii +(p0) sub r34 = r35, r36 + nop.i 999 ;; +(p0) shr r35 = r34, 0x6 ;; +} + +{ .mii +(p0) sub r40 = r38, r35 +(p0) adds r37 = 0x1, r35 +(p0) add r39 = r38, r35 ;; +} + +// Get the address of the J table, add the offset, +// addresses are AD_mJ and AD_J, get the T value +// f32 = T(j)_hi +// f33 = T(j)_lo +// f34 = T(-j)_hi +// f35 = T(-j)_lo + +{ .mmi +(p0) sub r34 = r35, r32 +(p0) addl r37 = @ltoff(double_cosh_j_table), gp + nop.i 999 +} +;; + +{ .mfi + ld8 r37 = [r37] +(p0) fma.s1 cosh_FR_Rcub = cosh_FR_Rsq, cosh_FR_R, f0 + nop.i 999 +} + +// ****************************************************** +// STEP 3 Now decide if we need to branch to EXP +//
****************************************************** +// Put 32 in f9; p6 true if x < 32 + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000010004 ;; +} + +// Calculate p_even +// f34 = B_2 + Rsq *B_3 +// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) +// f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_peven_temp1 = cosh_FR_Rsq, cosh_FR_B3, cosh_FR_B2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_peven_temp2 = cosh_FR_Rsq, cosh_FR_peven_temp1, cosh_FR_B1 + nop.i 999 +} + +// Calculate p_odd +// f34 = A_2 + Rsq *A_3 +// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) +// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_podd_temp1 = cosh_FR_Rsq, cosh_FR_A3, cosh_FR_A2 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp cosh_FR_N_temp1 = r39 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_peven = cosh_FR_Rsq, cosh_FR_peven_temp2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_podd_temp2 = cosh_FR_Rsq, cosh_FR_podd_temp1, cosh_FR_A1 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_podd = cosh_FR_podd_temp2, cosh_FR_Rcub, cosh_FR_R + nop.i 999 +} + +// GR_mJ contains the table offset for -j: (32 - j) * 32 bytes +// GR_J contains the table offset for +j: (32 + j) * 32 bytes +// p6 is set by the fcmp below when |x| < 32, p7 otherwise + +{ .mlx +(p0) setf.exp cosh_FR_N_temp2 = r40 +(p0) movl r40 = 0x0000000000000020 ;; +} + +{ .mfi +(p0) sub GR_mJ = r40, r36 +(p0) fmerge.se cosh_FR_spos = cosh_FR_N_temp1, f1 +(p0) adds GR_J = 0x20, r36 ;; +} + +{ .mii + nop.m 999 +(p0) shl GR_mJ = GR_mJ, 5 ;; +(p0) add AD_mJ = r37, GR_mJ ;; +} + +{ .mmi + nop.m 999 +(p0) ldfe cosh_FR_Tmjhi = [AD_mJ],16 +(p0) shl GR_J = GR_J, 5 ;; +} + +{ .mfi +(p0) ldfs cosh_FR_Tmjlo = [AD_mJ],16 +(p0) fcmp.lt.unc.s1 p6,p7 = cosh_FR_X,f9 +(p0) add AD_J = r37, GR_J ;; +} + +{ .mmi +(p0) ldfe cosh_FR_Tjhi = [AD_J],16 ;; +(p0) ldfs cosh_FR_Tjlo = [AD_J],16
+ nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.se cosh_FR_sneg = cosh_FR_N_temp2, f1 +(p7) br.cond.spnt L(COSH_BY_EXP) ;; +} + +// ****************************************************** +// If NOT branch to EXP +// ****************************************************** +// Calculate C_hi +// ****************************************************** +// cosh_FR_C_hi_temp = cosh_FR_sneg * cosh_FR_Tmjhi +// cosh_FR_C_hi = cosh_FR_spos * cosh_FR_Tjhi + (cosh_FR_sneg * cosh_FR_Tmjhi) + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_hi_temp = cosh_FR_sneg, cosh_FR_Tmjhi, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi_temp + nop.i 999 +} + +// ****************************************************** +// Calculate S_hi +// ****************************************************** +// cosh_FR_S_hi_temp1 = cosh_FR_sneg * cosh_FR_Tmjhi +// cosh_FR_S_hi = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_S_hi_temp1 + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_S_hi_temp1 = cosh_FR_sneg, cosh_FR_Tmjhi, f0 + nop.i 999 ;; +} + +// ****************************************************** +// Calculate C_lo +// ****************************************************** +// cosh_FR_C_lo_temp1 = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi +// cosh_FR_C_lo_temp2 = cosh_FR_sneg * cosh_FR_Tmjhi + (cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi) +// cosh_FR_C_lo_temp1 = cosh_FR_sneg * cosh_FR_Tmjlo +// cosh_FR_C_lo_temp3 = cosh_FR_spos * cosh_FR_Tjlo + (cosh_FR_sneg * cosh_FR_Tmjlo) +// cosh_FR_C_lo = cosh_FR_C_lo_temp3 + cosh_FR_C_lo_temp2 + +{ .mfi + nop.m 999 +(p0) fms.s1 cosh_FR_C_lo_temp1 = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fms.s1 cosh_FR_S_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_S_hi_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_lo_temp2 = cosh_FR_sneg, cosh_FR_Tmjhi, cosh_FR_C_lo_temp1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_lo_temp1 = cosh_FR_sneg,
cosh_FR_Tmjlo, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_lo_temp3 = cosh_FR_spos, cosh_FR_Tjlo, cosh_FR_C_lo_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_C_lo = cosh_FR_C_lo_temp3, f1, cosh_FR_C_lo_temp2 + nop.i 999 ;; +} + +// ****************************************************** +// cosh_FR_Y_lo_temp = cosh_FR_C_hi * cosh_FR_peven + cosh_FR_C_lo +// cosh_FR_Y_lo = cosh_FR_S_hi * cosh_FR_podd + cosh_FR_Y_lo_temp +// cosh_FR_COSH = Y_hi + Y_lo + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_C_hi, cosh_FR_peven, cosh_FR_C_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_S_hi, cosh_FR_podd, cosh_FR_Y_lo_temp + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.s0 f8 = cosh_FR_C_hi, f1, cosh_FR_Y_lo +(p0) br.ret.sptk b0 ;; +} + +L(COSH_BY_EXP): + +// When p7 is true, we know that an overflow is not going to happen +// When p7 is false, we must check for possible overflow +// p7 is the over_SAFE flag +// f44 = Scale * (Y_hi + Y_lo) +// = cosh_FR_spos * (cosh_FR_Tjhi + cosh_FR_Y_lo) + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_peven, f1, cosh_FR_podd + nop.i 999 +} + +// Now we are in EXP. This is the only path where an overflow is possible +// but not for certain. So this is the only path where over_SAFE has any use. 
+// r34 still has N-1 +// There is a danger of double-extended overflow if N-1 > 0x3ffe = 16382 +// There is a danger of double overflow if N-1 > 0x3fe = 1022 +// There is a danger of single overflow if N-1 > 0x7e = 126 + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000003ffe ;; +} + +{ .mfi +(p0) cmp.gt.unc p0,p7 = r34, r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_Tjhi, cosh_FR_Y_lo_temp, cosh_FR_Tjlo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_COSH_temp = cosh_FR_Y_lo, f1, cosh_FR_Tjhi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s0 f44 = cosh_FR_spos, cosh_FR_COSH_temp, f0 + nop.i 999 ;; +} + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p7) fmpy.s0 cosh_FR_tmp = cosh_FR_all_ones, cosh_FR_all_ones + nop.i 999 ;; +} + +// If over_SAFE is set, return +{ .mfb + nop.m 999 +(p7) fmerge.s f8 = f44,f44 +(p7) br.ret.sptk b0 ;; +} + +// Else see if we overflowed +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// If WRE is set then an overflow will not occur in EXP. +// The input value that would cause a register (WRE) value to overflow is about 2^15 +// and this input would go into the HUGE path. +// Answer with WRE is in f43. + +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fma.s2 f43 = cosh_FR_spos, cosh_FR_COSH_temp, f0 + nop.i 999 ;; +} + +// 103FF => 103FF -FFFF = 400(true) +// 400 + 3FF = 7FF, which is 1 more than the exponent of the largest +// double (7FE). 
So 0 103FF 8000000000000000 is one ulp more than +// largest double in register bias + +// 13FFF => 13FFF -FFFF = 4000(true) + +// Now set p8 if the answer with WRE is greater than or equal this value +// Also set p9 if the answer with WRE is less than or equal to negative this value + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000013fff ;; +} + +{ .mmf + nop.m 999 +(p0) setf.exp f41 = r32 +(p0) fsetc.s2 0x7F,0x40 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.ns f42 = f41, f41 + nop.i 999 ;; +} + +// The error tag for overflow is 63 +{ .mii + nop.m 999 + nop.i 999 ;; +(p8) mov GR_Parameter_TAG = 63 ;; +} + +{ .mfb + nop.m 999 +(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 +(p8) br.cond.spnt __libm_error_region ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +(p9) mov GR_Parameter_TAG = 63 +} + +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt __libm_error_region ;; +} + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p0) fmpy.s0 cosh_FR_tmp = cosh_FR_all_ones, cosh_FR_all_ones + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.s f8 = f44,f44 +(p0) br.ret.sptk b0 ;; +} + + +// for COSH_HUGE, put 24000 in exponent; take sign from input; add 1 +// SAFE: SAFE is always 0 for HUGE + +L(COSH_HUGE): + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000015dbf ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 cosh_FR_hi_lo = f1, f9, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s0 f44 = f9, cosh_FR_hi_lo, f0 +(p0) mov GR_Parameter_TAG = 63 +} +.endp coshl +ASM_SIZE_DIRECTIVE(coshl) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on 
stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_exp.S b/sysdeps/ia64/fpu/e_exp.S new file mode 100644 index 0000000000..06657b9579 --- /dev/null +++ b/sysdeps/ia64/fpu/e_exp.S @@ -0,0 +1,815 @@ +.file "exp.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 3/07/00 exp(inf) = inf but now does NOT call error support +// exp(-inf) = 0 but now does NOT call error support +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 11/30/00 Reworked to shorten main path, widen main path to include all +// args in normal range, and add quick exit for 0, nan, inf. +// 12/05/00 Loaded constants earlier with setf to save 2 cycles. + +// API +//============================================================== +// double exp(double) + +// Overview of operation +//============================================================== +// Take the input x. w is "how many log2/128 in x?" 
+// w = x * 128/log2 +// n = int(w) +// x = n log2/128 + r + delta + +// n = 128M + index_1 + 2^4 index_2 +// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta + +// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta) +// Construct 2^M +// Get 2^(index_1/128) from table_1; +// Get 2^(index_2/8) from table_2; +// Calculate exp(r) by series +// r = x - n (log2/128)_high +// delta = - n (log2/128)_low +// Calculate exp(delta) as 1 + delta + + +// Special values +//============================================================== +// exp(+0) = 1.0 +// exp(-0) = 1.0 + +// exp(+qnan) = +qnan +// exp(-qnan) = -qnan +// exp(+snan) = +qnan +// exp(-snan) = -qnan + +// exp(-inf) = +0 +// exp(+inf) = +inf + +// Overflow and Underflow +//======================= +// exp(x) = smallest double normal when +// x = -708.396 = c086232bdd7abcd2 + +// exp(x) = largest double normal when +// x = 709.7827 = 40862e42fefa39ef + + + +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input +// f6 -> f15, f32 -> f60 + +// General registers used: +// r32 -> r60 + +// Predicate registers used: +// p6 -> p15 + +#include "libm_support.h" + +// Assembly macros +//============================================================== + +exp_GR_rshf = r33 +EXP_AD_TB1 = r34 +EXP_AD_TB2 = r35 +EXP_AD_P = r36 + +exp_GR_N = r37 +exp_GR_index_1 = r38 +exp_GR_index_2_16 = r39 + +exp_GR_biased_M = r40 +exp_GR_index_1_16 = r41 +EXP_AD_T1 = r42 +EXP_AD_T2 = r43 +exp_GR_sig_inv_ln2 = r44 + +exp_GR_17ones = r45 +exp_GR_one = r46 +exp_TB1_size = r47 +exp_TB2_size = r48 +exp_GR_rshf_2to56 = r49 + +exp_GR_gt_ln = r50 +exp_GR_exp_2tom56 = r51 + +exp_GR_17ones_m1 = r52 + +GR_SAVE_B0 = r53 +GR_SAVE_PFS = r54 +GR_SAVE_GP = r55 +GR_SAVE_SP = r56 + +GR_Parameter_X = r57 +GR_Parameter_Y = r58 +GR_Parameter_RESULT = r59 +GR_Parameter_TAG = r60 + + +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 + +EXP_RSHF_2TO56 = f6 +EXP_INV_LN2_2TO63 = f7
+EXP_W_2TO56_RSH = f9 +EXP_2TOM56 = f11 +exp_P4 = f12 +exp_P3 = f13 +exp_P2 = f14 +exp_P1 = f15 + +exp_ln2_by_128_hi = f33 +exp_ln2_by_128_lo = f34 + +EXP_RSHF = f35 +EXP_Nfloat = f36 +exp_W = f37 +exp_r = f38 +exp_f = f39 + +exp_rsq = f40 +exp_rcube = f41 + +EXP_2M = f42 +exp_S1 = f43 +exp_T1 = f44 + +EXP_MIN_DBL_OFLOW_ARG = f45 +EXP_MAX_DBL_ZERO_ARG = f46 +EXP_MAX_DBL_NORM_ARG = f47 +EXP_MAX_DBL_UFLOW_ARG = f48 +EXP_MIN_DBL_NORM_ARG = f49 +exp_rP4pP3 = f50 +exp_P_lo = f51 +exp_P_hi = f52 +exp_P = f53 +exp_S = f54 + +EXP_NORM_f8 = f56 + +exp_wre_urm_f8 = f57 +exp_ftz_urm_f8 = f57 + +exp_gt_pln = f58 + +exp_S2 = f59 +exp_T2 = f60 + + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** + +// double-extended 1/ln(2) +// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 +// 3fff b8aa 3b29 5c17 f0bc +// For speed the significand will be loaded directly with a movl and setf.sig +// and the exponent will be bias+63 instead of bias+0. Thus subsequent +// computations need to scale appropriately. +// The constant 128/ln(2) is needed for the computation of w. This is also +// obtained by scaling the computations. +// +// Two shifting constants are loaded directly with movl and setf.d. +// 1. EXP_RSHF_2TO56 = 1.1000..00 * 2^(63+63-7), i.e. 2^(63+56) +// This constant is added to x*1/ln2 to shift the integer part of +// x*128/ln2 into the rightmost bits of the significand. +// The result of this fma is EXP_W_2TO56_RSH. +// 2. EXP_RSHF = 1.1000..00 * 2^(63) +// This constant is subtracted from EXP_W_2TO56_RSH * 2^(-56) to give +// the integer part of w, n, as a floating-point number. +// The result of this fms is EXP_Nfloat.
+ + +exp_table_1: +ASM_TYPE_DIRECTIVE(exp_table_1,@object) +data8 0x40862e42fefa39f0 // smallest dbl overflow arg +data8 0xc0874c0000000000 // approx largest arg for zero result +data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result +data8 0xc086232bdd7abcd3 // largest dbl underflow arg +data8 0xc086232bdd7abcd2 // smallest dbl arg to give normal dbl result +data8 0x0 // pad +data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi +data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo + +// Table 1 is 2^(index_1/128) where +// index_1 goes from 0 to 15 + +data8 0x8000000000000000 , 0x00003FFF +data8 0x80B1ED4FD999AB6C , 0x00003FFF +data8 0x8164D1F3BC030773 , 0x00003FFF +data8 0x8218AF4373FC25EC , 0x00003FFF +data8 0x82CD8698AC2BA1D7 , 0x00003FFF +data8 0x8383594EEFB6EE37 , 0x00003FFF +data8 0x843A28C3ACDE4046 , 0x00003FFF +data8 0x84F1F656379C1A29 , 0x00003FFF +data8 0x85AAC367CC487B15 , 0x00003FFF +data8 0x8664915B923FBA04 , 0x00003FFF +data8 0x871F61969E8D1010 , 0x00003FFF +data8 0x87DB357FF698D792 , 0x00003FFF +data8 0x88980E8092DA8527 , 0x00003FFF +data8 0x8955EE03618E5FDD , 0x00003FFF +data8 0x8A14D575496EFD9A , 0x00003FFF +data8 0x8AD4C6452C728924 , 0x00003FFF +ASM_SIZE_DIRECTIVE(exp_table_1) + +// Table 2 is 2^(index_2/8) where +// index_2 goes from 0 to 7 +exp_table_2: +ASM_TYPE_DIRECTIVE(exp_table_2,@object) +data8 0x8000000000000000 , 0x00003FFF +data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF +data8 0x9837F0518DB8A96F , 0x00003FFF +data8 0xA5FED6A9B15138EA , 0x00003FFF +data8 0xB504F333F9DE6484 , 0x00003FFF +data8 0xC5672A115506DADD , 0x00003FFF +data8 0xD744FCCAD69D6AF4 , 0x00003FFF +data8 0xEAC0C6E7DD24392F , 0x00003FFF +ASM_SIZE_DIRECTIVE (exp_table_2) + + +exp_p_table: +ASM_TYPE_DIRECTIVE(exp_p_table,@object) +data8 0x3f8111116da21757 //P_4 +data8 0x3fa55555d787761c //P_3 +data8 0x3fc5555555555414 //P_2 +data8 0x3fdffffffffffd6a //P_1 +ASM_SIZE_DIRECTIVE(exp_p_table) + + +.align 32 +.global exp# + +.section .text +.proc exp# +.align 32 +exp: +#ifdef
_LIBC +.global __ieee754_exp# +__ieee754_exp: +#endif + +{ .mlx + alloc r32=ar.pfs,1,24,4,0 + movl exp_GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 +} +{ .mlx + addl EXP_AD_TB1 = @ltoff(exp_table_1), gp + movl exp_GR_rshf_2to56 = 0x4768000000000000 ;; // 1.10000 2^(63+56) +} +;; + +// We do this fnorm right at the beginning to take any enabled +// faults and to normalize any input unnormals so that SWA is not taken. +{ .mfi + ld8 EXP_AD_TB1 = [EXP_AD_TB1] + fclass.m p8,p0 = f8,0x07 // Test for x=0 + mov exp_GR_17ones = 0x1FFFF +} +{ .mfi + mov exp_TB1_size = 0x100 + fnorm EXP_NORM_f8 = f8 + mov exp_GR_exp_2tom56 = 0xffff-56 +} +;; + +// Form two constants we need +// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 +// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand + +{ .mmf + setf.sig EXP_INV_LN2_2TO63 = exp_GR_sig_inv_ln2 // form 1/ln2 * 2^63 + setf.d EXP_RSHF_2TO56 = exp_GR_rshf_2to56 // Form const 1.100 * 2^(63+56) + fclass.m p9,p0 = f8,0x22 // Test for x=-inf +} +;; + +{ .mlx + setf.exp EXP_2TOM56 = exp_GR_exp_2tom56 // form 2^-56 for scaling Nfloat + movl exp_GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift +} +{ .mfb + mov exp_TB2_size = 0x80 +(p8) fma.d f8 = f1,f1,f0 // quick exit for x=0 +(p8) br.ret.spnt b0 +;; +} + +{ .mfi + ldfpd EXP_MIN_DBL_OFLOW_ARG, EXP_MAX_DBL_ZERO_ARG = [EXP_AD_TB1],16 + fclass.m p10,p0 = f8,0x21 // Test for x=+inf + nop.i 999 +} +{ .mfb + nop.m 999 +(p9) fma.d f8 = f0,f0,f0 // quick exit for x=-inf +(p9) br.ret.spnt b0 +;; +} + +{ .mmf + ldfpd EXP_MAX_DBL_NORM_ARG, EXP_MAX_DBL_UFLOW_ARG = [EXP_AD_TB1],16 + setf.d EXP_RSHF = exp_GR_rshf // Form right shift const 1.100 * 2^63 + fclass.m p11,p0 = f8,0xc3 // Test for x=nan +;; +} + +{ .mfb + ldfd EXP_MIN_DBL_NORM_ARG = [EXP_AD_TB1],16 + nop.f 999 +(p10) br.ret.spnt b0 // quick exit for x=+inf +;; +} + +{ .mfi + ldfe exp_ln2_by_128_hi = [EXP_AD_TB1],16 + nop.f 999 + nop.i 999 +;; +} + + +{ .mfb + ldfe exp_ln2_by_128_lo = [EXP_AD_TB1],16 
+(p11) fmerge.s f8 = EXP_NORM_f8, EXP_NORM_f8 +(p11) br.ret.spnt b0 // quick exit for x=nan +;; +} + +// After that last load, EXP_AD_TB1 points to the beginning of table 1 + +// W = X * Inv_log2_by_128 +// By adding 1.10...0*2^63 we shift and get round_int(W) in significand. +// We actually add 1.10...0*2^(63+56) to X * Inv_log2 * 2^63, i.e. the same sum scaled by 2^56. + +{ .mfi + nop.m 999 + fma.s1 EXP_W_2TO56_RSH = EXP_NORM_f8, EXP_INV_LN2_2TO63, EXP_RSHF_2TO56 + nop.i 999 +;; +} + + +// Divide arguments into the following categories: +// Certain Underflow/zero p11 - -inf < x <= MAX_DBL_ZERO_ARG +// Certain Underflow p12 - MAX_DBL_ZERO_ARG < x <= MAX_DBL_UFLOW_ARG +// Possible Underflow p13 - MAX_DBL_UFLOW_ARG < x < MIN_DBL_NORM_ARG +// Certain Safe - MIN_DBL_NORM_ARG <= x <= MAX_DBL_NORM_ARG +// Possible Overflow p14 - MAX_DBL_NORM_ARG < x < MIN_DBL_OFLOW_ARG +// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= x < +inf +// +// If the input is really a double arg, then there will never be "Possible +// Underflow" or "Possible Overflow" arguments. +// + +{ .mfi + add EXP_AD_TB2 = exp_TB1_size, EXP_AD_TB1 + fcmp.ge.s1 p15,p14 = EXP_NORM_f8,EXP_MIN_DBL_OFLOW_ARG + nop.i 999 +;; +} + +{ .mfi + add EXP_AD_P = exp_TB2_size, EXP_AD_TB2 + fcmp.le.s1 p11,p12 = EXP_NORM_f8,EXP_MAX_DBL_ZERO_ARG + nop.i 999 +;; +} + +{ .mfb + ldfpd exp_P4, exp_P3 = [EXP_AD_P] ,16 +(p14) fcmp.gt.unc.s1 p14,p0 = EXP_NORM_f8,EXP_MAX_DBL_NORM_ARG +(p15) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) +;; +} + + +// Nfloat = round_int(W) +// The significand of EXP_W_2TO56_RSH contains the rounded integer part of W, +// as a twos complement number in the lower bits (that is, it may be negative). +// That twos complement number (called N) is put into exp_GR_N. + +// Since EXP_W_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56 +// before the shift constant 1.10000 * 2^63 is subtracted to yield EXP_Nfloat.
+// Thus, EXP_Nfloat contains the floating point version of N + + +{ .mfi + nop.m 999 +(p12) fcmp.le.unc p12,p0 = EXP_NORM_f8,EXP_MAX_DBL_UFLOW_ARG + nop.i 999 +} +{ .mfb + ldfpd exp_P2, exp_P1 = [EXP_AD_P] + fms.s1 EXP_Nfloat = EXP_W_2TO56_RSH, EXP_2TOM56, EXP_RSHF +(p11) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW_ZERO) +;; +} + +{ .mfi + getf.sig exp_GR_N = EXP_W_2TO56_RSH +(p13) fcmp.lt.unc p13,p0 = EXP_NORM_f8,EXP_MIN_DBL_NORM_ARG + nop.i 999 +;; +} + + +// exp_GR_index_1 has index_1 +// exp_GR_index_2_16 has index_2 * 16 +// exp_GR_biased_M has M + 0xffff (biased M) +// exp_GR_index_1_16 has index_1 * 16 + +// r2 has true M +{ .mfi + and exp_GR_index_1 = 0x0f, exp_GR_N + fnma.s1 exp_r = EXP_Nfloat, exp_ln2_by_128_hi, EXP_NORM_f8 + shr r2 = exp_GR_N, 0x7 +} +{ .mfi + and exp_GR_index_2_16 = 0x70, exp_GR_N + fnma.s1 exp_f = EXP_Nfloat, exp_ln2_by_128_lo, f1 + nop.i 999 +;; +} + + +// EXP_AD_T1 has address of T1 +// EXP_AD_T2 has address of T2 + +{ .mmi + addl exp_GR_biased_M = 0xffff, r2 + add EXP_AD_T2 = EXP_AD_TB2, exp_GR_index_2_16 + shladd EXP_AD_T1 = exp_GR_index_1, 4, EXP_AD_TB1 +;; +} + + +// Create Scale = 2^M +// r = x - Nfloat * ln2_by_128_hi +// f = 1 - Nfloat * ln2_by_128_lo + +{ .mmi + setf.exp EXP_2M = exp_GR_biased_M + ldfe exp_T2 = [EXP_AD_T2] + nop.i 999 +;; +} + +// Load T1 and T2 +{ .mfi + ldfe exp_T1 = [EXP_AD_T1] + nop.f 999 + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 exp_rsq = exp_r, exp_r, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 exp_rP4pP3 = exp_r, exp_P4, exp_P3 + nop.i 999 +;; +} + + + +{ .mfi + nop.m 999 + fma.s1 exp_rcube = exp_r, exp_rsq, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 exp_P_lo = exp_r, exp_rP4pP3, exp_P2 + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 exp_P_hi = exp_rsq, exp_P1, exp_r + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 exp_S2 = exp_f,exp_T2,f0 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 exp_S1 = EXP_2M,exp_T1,f0 + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 exp_P = exp_rcube, exp_P_lo, exp_P_hi + nop.i
999 +;; +} + +{ .mfi + nop.m 999 + fma.s1 exp_S = exp_S1,exp_S2,f0 + nop.i 999 +;; +} + +{ .bbb +(p12) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW) +(p13) br.cond.spnt L(EXP_POSSIBLE_UNDERFLOW) +(p14) br.cond.spnt L(EXP_POSSIBLE_OVERFLOW) +;; +} + + +{ .mfb + nop.m 999 + fma.d f8 = exp_S, exp_P, exp_S + br.ret.sptk b0 ;; // Normal path exit +} + + +L(EXP_POSSIBLE_OVERFLOW): + +// We got an answer. EXP_MAX_DBL_NORM_ARG < x < EXP_MIN_DBL_OFLOW_ARG +// overflow is a possibility, not a certainty + +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x42 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.d.s2 exp_wre_urm_f8 = exp_S, exp_P, exp_S + nop.i 999 ;; +} + +// We define an overflow when the answer with +// WRE set +// user-defined rounding mode +// is ldn +1 + +// Is the exponent 1 more than the largest double? +// If so, go to ERROR RETURN, else get the answer and +// leave. + +// Largest double is 7FE (biased double) +// 7FE - 3FF + FFFF = 103FE +// Create + largest_double_plus_ulp +// Create - largest_double_plus_ulp +// Calculate answer with WRE set. 
+ +// Cases when answer is ldn+1 are as follows: +// ldn ldn+1 +// --+----------|----------+------------ +// | +// +inf +inf -inf +// RN RN +// RZ + +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x40 + mov exp_GR_gt_ln = 0x103ff ;; +} + +{ .mfi + setf.exp exp_gt_pln = exp_GR_gt_ln + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fcmp.ge.unc.s1 p6, p0 = exp_wre_urm_f8, exp_gt_pln + nop.i 999 ;; +} + +{ .mfb + nop.m 999 + nop.f 999 +(p6) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) ;; // Branch if really overflow +} + +{ .mfb + nop.m 999 + fma.d f8 = exp_S, exp_P, exp_S + br.ret.sptk b0 ;; // Exit if really no overflow +} + +L(EXP_CERTAIN_OVERFLOW): +{ .mmi + sub exp_GR_17ones_m1 = exp_GR_17ones, r0, 1 ;; + setf.exp f9 = exp_GR_17ones_m1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fmerge.s FR_X = f8,f8 + nop.i 999 +} +{ .mfb + mov GR_Parameter_TAG = 14 + fma.d FR_RESULT = f9, f9, f0 // Set I,O and +INF result + br.cond.sptk __libm_error_region ;; +} + +L(EXP_POSSIBLE_UNDERFLOW): + +// We got an answer. EXP_MAX_DBL_UFLOW_ARG < x < EXP_MIN_DBL_NORM_ARG +// underflow is a possibility, not a certainty + +// We define an underflow when the answer with +// ftz set +// is zero (tiny numbers become zero) + +// Notice (from below) that if we have an unlimited exponent range, +// then there is an extra machine number E between the largest denormal and +// the smallest normal. + +// So if with unbounded exponent we round to E or below, then we are +// tiny and underflow has occurred. + +// But notice that you can be in a situation where we are tiny, namely +// rounded to E, but when the exponent is bounded we round to smallest +// normal. So the answer can be the smallest normal with underflow. 
+ +// E +// -----+--------------------+--------------------+----- +// | | | +// 1.1...10 2^-3fff 1.1...11 2^-3fff 1.0...00 2^-3ffe +// 0.1...11 2^-3ffe (biased, 1) +// largest dn smallest normal + +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x41 + nop.i 999 ;; +} +{ .mfi + nop.m 999 + fma.d.s2 exp_ftz_urm_f8 = exp_S, exp_P, exp_S + nop.i 999 ;; +} +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x40 + nop.i 999 ;; +} +{ .mfi + nop.m 999 + fcmp.eq.unc.s1 p6, p0 = exp_ftz_urm_f8, f0 + nop.i 999 ;; +} +{ .mfb + nop.m 999 + nop.f 999 +(p6) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW) ;; // Branch if really underflow +} +{ .mfb + nop.m 999 + fma.d f8 = exp_S, exp_P, exp_S + br.ret.sptk b0 ;; // Exit if really no underflow +} + +L(EXP_CERTAIN_UNDERFLOW): +{ .mfi + nop.m 999 + fmerge.s FR_X = f8,f8 + nop.i 999 +} +{ .mfb + mov GR_Parameter_TAG = 15 + fma.d FR_RESULT = exp_S, exp_P, exp_S // Set I,U and tiny result + br.cond.sptk __libm_error_region ;; +} + +L(EXP_CERTAIN_UNDERFLOW_ZERO): +{ .mmi + mov exp_GR_one = 1 ;; + setf.exp f9 = exp_GR_one + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fmerge.s FR_X = f8,f8 + nop.i 999 +} +{ .mfb + mov GR_Parameter_TAG = 15 + fma.d FR_RESULT = f9, f9, f0 // Set I,U and tiny (+0.0) result + br.cond.sptk __libm_error_region ;; +} + +.endp exp +ASM_SIZE_DIRECTIVE(exp) + + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on 
stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_expf.S b/sysdeps/ia64/fpu/e_expf.S new file mode 100644 index 0000000000..1288cb96a2 --- /dev/null +++ b/sysdeps/ia64/fpu/e_expf.S @@ -0,0 +1,768 @@ +.file "expf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. + +// History +//============================================================== +// 4/04/00 Unwind update +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 8/21/00 Improvements to save 2 cycles on main path, and shorten x=0 case +// 12/07/00 Widen main path, shorten x=inf, nan paths +// + +#include "libm_support.h" + +// Assembly macros +//============================================================== +// integer registers used + + exp_GR_0x0f = r33 + exp_GR_0xf0 = r34 + + EXP_AD_P_1 = r36 + EXP_AD_P_2 = r37 + EXP_AD_T1 = r38 + EXP_AD_T2 = r39 + exp_GR_Mint = r40 + + exp_GR_Mint_p_128 = r41 + exp_GR_Ind1 = r42 + EXP_AD_M1 = r43 + exp_GR_Ind2 = r44 + EXP_AD_M2 = r45 + + exp_GR_min_oflow = r46 + exp_GR_max_zero = r47 + exp_GR_max_norm = r48 + exp_GR_max_uflow = r49 + exp_GR_min_norm = r50 + + exp_GR_17ones = r51 + exp_GR_gt_ln = r52 + exp_GR_T2_size = r53 + + exp_GR_17ones_m1 = r56 + exp_GR_one = r57 + + + +GR_SAVE_B0 = r53 +GR_SAVE_PFS = r55 +GR_SAVE_GP = r54 + +GR_Parameter_X = r59 +GR_Parameter_Y = r60 +GR_Parameter_RESULT = r61 +GR_Parameter_TAG = r62 + +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 + + +// floating point registers used + + EXP_MIN_SGL_OFLOW_ARG = f11 + EXP_MAX_SGL_ZERO_ARG = f12 + EXP_MAX_SGL_NORM_ARG = f13 + EXP_MAX_SGL_UFLOW_ARG = f14 + EXP_MIN_SGL_NORM_ARG = f15 + + exp_coeff_P5 = f32 + exp_coeff_P6 = f33 + exp_coeff_P3 = f34 + exp_coeff_P4 = f35 + + exp_coeff_P1 = f36 + exp_coeff_P2 = f37 + exp_Mx = f38 + exp_Mfloat = f39 + exp_R = f40 + + exp_P1 = f41 + exp_P2 = f42 + exp_P3 = f43 + exp_Rsq = f44 + exp_R4 = f45 + + exp_P4 = f46 + exp_P5 = f47 + exp_P6 = f48 + exp_P7 = f49 + exp_T1 = f50 + + exp_T2 = f51 + exp_T = f52 + exp_A = f53 + exp_norm_f8 = 
f54 + exp_wre_urm_f8 = f55 + + exp_ftz_urm_f8 = f56 + exp_gt_pln = f57 + + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +exp_coeff_1_table: +ASM_TYPE_DIRECTIVE(exp_coeff_1_table,@object) +data8 0x3F56F35FDE4F8563 // p5 +data8 0x3F2A378BEFECCFDD // p6 +data8 0x3FE00000258C581D // p1 +data8 0x3FC555557AE7B3D4 // p2 +ASM_SIZE_DIRECTIVE(exp_coeff_1_table) + + +exp_coeff_2_table: +ASM_TYPE_DIRECTIVE(exp_coeff_2_table,@object) +data8 0x3FA5551BB6592FAE // p3 +data8 0x3F8110E8EBFFD485 // p4 +ASM_SIZE_DIRECTIVE(exp_coeff_2_table) + + +exp_T2_table: +ASM_TYPE_DIRECTIVE(exp_T2_table,@object) +data8 0xa175cf9cd7d85844 , 0x00003f46 // exp(-128) +data8 0xdb7279415a1f9eed , 0x00003f47 // exp(-127) +data8 0x95213b242bd8ca5f , 0x00003f49 // exp(-126) +data8 0xcab03c968c989f83 , 0x00003f4a // exp(-125) +data8 0x89bdb674702961ad , 0x00003f4c // exp(-124) +data8 0xbb35a2eec278be35 , 0x00003f4d // exp(-123) +data8 0xfe71b17f373e7e7a , 0x00003f4e // exp(-122) +data8 0xace9a6ec52a39b63 , 0x00003f50 // exp(-121) +data8 0xeb03423fe393cf1c , 0x00003f51 // exp(-120) +data8 0x9fb52c5bcaef1693 , 0x00003f53 // exp(-119) +data8 0xd910b6377ed60bf1 , 0x00003f54 // exp(-118) +data8 0x9382dad8a9fdbfe4 , 0x00003f56 // exp(-117) +data8 0xc87d0a84dea869a3 , 0x00003f57 // exp(-116) +data8 0x883efb4c6d1087b0 , 0x00003f59 // exp(-115) +data8 0xb92d7373dce9a502 , 0x00003f5a // exp(-114) +data8 0xfbaeb020577fb0cb , 0x00003f5b // exp(-113) +ASM_SIZE_DIRECTIVE(exp_T2_table) + + +exp_T1_table: +ASM_TYPE_DIRECTIVE(exp_T1_table,@object) +data8 0x8000000000000000 , 0x00003fff // exp(16 * 0) +data8 0x87975e8540010249 , 0x00004016 // exp(16 * 1) +data8 0x8fa1fe625b3163ec , 0x0000402d // exp(16 * 2) +data8 0x9826b576512a59d7 , 0x00004044 // exp(16 * 3) +data8 0xa12cc167acbe6902 , 0x0000405b // exp(16 * 4) +data8 0xaabbcdcc279f59e4 , 0x00004072 // exp(16 * 5) +data8 0xb4dbfaadc045d16f , 0x00004089 // exp(16 * 6) +data8 0xbf95e372ccdbf146 , 0x000040a0 // exp(16 * 7) +data8 0xcaf2a62eea10bbfb , 
0x000040b7 // exp(16 * 8) +data8 0xd6fbeb62fddbd340 , 0x000040ce // exp(16 * 9) +data8 0xe3bbee32e4a440ea , 0x000040e5 // exp(16 * 10) +data8 0xf13d8517c34199a8 , 0x000040fc // exp(16 * 11) +data8 0xff8c2b166241eedd , 0x00004113 // exp(16 * 12) +data8 0x875a04c0b38d6129 , 0x0000412b // exp(16 * 13) +data8 0x8f610127db6774d7 , 0x00004142 // exp(16 * 14) +data8 0x97e1dd87e5c20bb6 , 0x00004159 // exp(16 * 15) +ASM_SIZE_DIRECTIVE(exp_T1_table) + +// Argument Reduction +// exp_Mx = (int)f8 ==> The value of f8 rounded to int is placed into the +// significand of exp_Mx as a two's +// complement number. + +// Later we want to have exp_Mx in a general register. Do this with a getf.sig +// and call the general register exp_GR_Mint + +// exp_Mfloat = (float)(int)f8 ==> the two's complement number in +// significand of exp_Mx is turned +// into a floating point number. +// R = f8 - exp_Mfloat ==> reduced argument + +// Core Approximation +// Calculate a series in R +// R * p6 + p5 +// R * p4 + p3 +// R * p2 + p1 +// R^2 +// R^4 +// R^2(R * p6 + p5) + (R * p4 + p3) +// R^2(R * p2 + p1) +// R^4(R^2(R * p6 + p5) + (R * p4 + p3)) + (R^2(R * p2 + p1)) +// R + 1 +// exp(R) = (1 + R) + R^4(R^2(R * p6 + p5) + (R * p4 + p3)) + (R^2(R * p2 + p1)) +// exp(R) = 1 + R + R^2 * p1 + R^3 * p2 + R^4 * p3 + R^5 * p4 + R^6 * p5 + R^7 * p6 + +// Reconstruction +// significand of exp_Mx is two's complement, +// -103 < x < 89 +// The smallest single denormal is 2^-149 = ssdn +// For e^x = ssdn +// x = log(ssdn) = -103.279 +// But with rounding result goes to ssdn until -103.972079 +// The largest single normal is 1.<23 1's> 2^127 ~ 2^128 = lsn +// For e^x = lsn +// x = log(lsn) = 88.7228 +// +// expf overflows when x >= 42b17218 = 88.7228 +// expf returns largest single denormal when x = c2aeac50 +// expf goes to zero when x <= c2cff1b5 + +// Consider range of 8-bit two's complement, -128 ---> 127 +// Add 128; range becomes 0 ---> 255 + +// The number (=i) in 0 ---> 255 is used as offset into two
tables. + +// i = abcd efgh = abcd * 16 + efgh = i1 * 16 + i2 + +// i1 = (exp_GR_Mint + 128) & 0xf0 (show 0xf0 as -0x10 to avoid assembler error) +// (The immediate in the AND is an 8-bit two's complement) +// i1 = i1 + start of T1 table (EXP_AD_T1) +// Note that the entries in T1 are double-extended numbers on 16-byte boundaries +// and that i1 is already multiplied by 16 (shifted left by 4) after the AND. + +// i2 must be shifted left by 4 before adding to the start of the table. +// i2 = ((exp_GR_Mint + 128) & 0x0f) << 4 +// i2 = i2 + start of T2 table (EXP_AD_T2) + +// T = T1 * T2 +// A = T * (1 + R) +// answer = T * (R^2 * p1 + R^3 * p2 + R^4 * p3 + R^5 * p4 + R^6 * p5 + R^7 * p6) + +// T * (1 + R) +// = T * exp(R) + + +.global expf# + +.section .text +.proc expf# +.align 32 +expf: +#ifdef _LIBC +.global __ieee754_expf# +__ieee754_expf: +#endif + +{ .mfi + alloc r32 = ar.pfs,1,26,4,0 + fcvt.fx.s1 exp_Mx = f8 + mov exp_GR_17ones = 0x1FFFF +} +{ .mlx + addl EXP_AD_P_1 = @ltoff(exp_coeff_1_table),gp + movl exp_GR_min_oflow = 0x42b17218 +} +;; + +// Fnorm done to take any enabled faults +{ .mfi + ld8 EXP_AD_P_1 = [EXP_AD_P_1] + fclass.m p6,p0 = f8, 0x07 //@zero + nop.i 999 +} +{ .mfi + add exp_GR_max_norm = -1, exp_GR_min_oflow // 0x42b17217 + fnorm exp_norm_f8 = f8 + nop.i 999 +} +;; + +{ .mfi + setf.s EXP_MIN_SGL_OFLOW_ARG = exp_GR_min_oflow // 0x42b17218 + fclass.m p7,p0 = f8, 0x22 // Test for x=-inf + mov exp_GR_0xf0 = 0x0f0 +} +{ .mlx + setf.s EXP_MAX_SGL_NORM_ARG = exp_GR_max_norm + movl exp_GR_max_zero = 0xc2cff1b5 +} +;; + + +{ .mlx + mov exp_GR_0x0f = 0x00f + movl exp_GR_max_uflow = 0xc2aeac50 +} +{ .mfb + nop.m 999 +(p6) fma.s f8 = f1,f1,f0 +(p6) br.ret.spnt b0 // quick exit for x=0 +} +;; + +{ .mfi + setf.s EXP_MAX_SGL_ZERO_ARG = exp_GR_max_zero + fclass.m p8,p0 = f8, 0x21 // Test for x=+inf + adds exp_GR_min_norm = 1, exp_GR_max_uflow // 0xc2aeac51 +} +{ .mfb + ldfpd exp_coeff_P5,exp_coeff_P6 = [EXP_AD_P_1],16 +(p7) fma.s f8 = f0,f0,f0 +(p7) br.ret.spnt b0 // quick exit
for x=-inf +} +;; + +{ .mmf + ldfpd exp_coeff_P1,exp_coeff_P2 = [EXP_AD_P_1],16 + setf.s EXP_MAX_SGL_UFLOW_ARG = exp_GR_max_uflow + fclass.m p9,p0 = f8, 0xc3 // Test for x=nan +} +;; + +{ .mmb + ldfpd exp_coeff_P3,exp_coeff_P4 = [EXP_AD_P_1],16 + setf.s EXP_MIN_SGL_NORM_ARG = exp_GR_min_norm +(p8) br.ret.spnt b0 // quick exit for x=+inf +} +;; + +// EXP_AD_P_1 now points to exp_T2_table +{ .mfi + mov exp_GR_T2_size = 0x100 + fcvt.xf exp_Mfloat = exp_Mx + nop.i 999 +} +;; + +{ .mfb + getf.sig exp_GR_Mint = exp_Mx +(p9) fmerge.s f8 = exp_norm_f8, exp_norm_f8 +(p9) br.ret.spnt b0 // quick exit for x=nan +} +;; + +{ .mmi + nop.m 999 + mov EXP_AD_T2 = EXP_AD_P_1 + add EXP_AD_T1 = exp_GR_T2_size,EXP_AD_P_1 ;; +} + + +{ .mmi + adds exp_GR_Mint_p_128 = 0x80,exp_GR_Mint ;; + and exp_GR_Ind1 = exp_GR_Mint_p_128, exp_GR_0xf0 + and exp_GR_Ind2 = exp_GR_Mint_p_128, exp_GR_0x0f ;; +} + +// Divide arguments into the following categories: +// Certain Underflow/zero p11 - -inf < x <= MAX_SGL_ZERO_ARG +// Certain Underflow p12 - MAX_SGL_ZERO_ARG < x <= MAX_SGL_UFLOW_ARG +// Possible Underflow p13 - MAX_SGL_UFLOW_ARG < x < MIN_SGL_NORM_ARG +// Certain Safe - MIN_SGL_NORM_ARG <= x <= MAX_SGL_NORM_ARG +// Possible Overflow p14 - MAX_SGL_NORM_ARG < x < MIN_SGL_OFLOW_ARG +// Certain Overflow p15 - MIN_SGL_OFLOW_ARG <= x < +inf +// +// If the input is really a single arg, then there will never be "Possible +// Underflow" or "Possible Overflow" arguments. 
+// + +{ .mfi + add EXP_AD_M1 = exp_GR_Ind1,EXP_AD_T1 + fcmp.ge.s1 p15,p14 = exp_norm_f8,EXP_MIN_SGL_OFLOW_ARG + nop.i 999 +} +{ .mfi + shladd EXP_AD_M2 = exp_GR_Ind2,4,EXP_AD_T2 + fms.s1 exp_R = f1,f8,exp_Mfloat + nop.i 999 ;; +} + +{ .mfi + ldfe exp_T1 = [EXP_AD_M1] + fcmp.le.s1 p11,p12 = exp_norm_f8,EXP_MAX_SGL_ZERO_ARG + nop.i 999 ;; +} + +{ .mfb + ldfe exp_T2 = [EXP_AD_M2] +(p14) fcmp.gt.s1 p14,p0 = exp_norm_f8,EXP_MAX_SGL_NORM_ARG +(p15) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) ;; +} + +{ .mfb + nop.m 999 +(p12) fcmp.le.s1 p12,p0 = exp_norm_f8,EXP_MAX_SGL_UFLOW_ARG +(p11) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW_ZERO) +} +;; + +{ .mfi + nop.m 999 +(p13) fcmp.lt.s1 p13,p0 = exp_norm_f8,EXP_MIN_SGL_NORM_ARG + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 exp_Rsq = exp_R,exp_R,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 exp_P3 = exp_R,exp_coeff_P2,exp_coeff_P1 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 exp_P1 = exp_R,exp_coeff_P6,exp_coeff_P5 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 exp_P2 = exp_R,exp_coeff_P4,exp_coeff_P3 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 exp_P7 = f1,exp_R,f1 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 exp_P5 = exp_Rsq,exp_P3,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 exp_R4 = exp_Rsq,exp_Rsq,f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 exp_T = exp_T1,exp_T2,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 exp_P4 = exp_Rsq,exp_P1,exp_P2 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 exp_A = exp_T,exp_P7,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 exp_P6 = exp_R4,exp_P4,exp_P5 + nop.i 999 +} +;; + +{ .bbb +(p12) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW) +(p13) br.cond.spnt L(EXP_POSSIBLE_UNDERFLOW) +(p14) br.cond.spnt L(EXP_POSSIBLE_OVERFLOW) +} +;; + +{ .mfb + nop.m 999 + fma.s f8 = exp_T,exp_P6,exp_A + br.ret.sptk b0 +} +;; + +L(EXP_POSSIBLE_OVERFLOW): + +// We got an answer. 
EXP_MAX_SGL_NORM_ARG < x < EXP_MIN_SGL_OFLOW_ARG +// overflow is a possibility, not a certainty +// Set wre in s2 and perform the last operation with s2 + +// We define an overflow when the answer with +// WRE set +// user-defined rounding mode +// is lsn +1 + +// Is the exponent 1 more than the largest single? +// If so, go to ERROR RETURN, else (no overflow) get the answer and +// leave. + +// Largest single is FE (biased single) +// FE - 7F + FFFF = 1007E + +// Create + largest_single_plus_ulp +// Create - largest_single_plus_ulp + +// Calculate answer with WRE set. + +// Cases when answer is lsn+1 are as follows: + +// midpoint +// | +// lsn | lsn+1 +// --+----------|----------+------------ +// | +// +inf +inf -inf +// RN RN +// RZ +// exp_gt_pln contains the floating point number lsn+1. +// The setf.exp puts 0x1007f in the exponent and 0x800... in the significand. + +// If the answer is >= lsn+1, we have overflowed. +// Then p6 is TRUE. Set the overflow tag, save input in FR_X, +// do the final calculation for IEEE result, and branch to error return. 
+ +{ .mfi + mov exp_GR_gt_ln = 0x1007F + fsetc.s2 0x7F,0x42 + nop.i 999 +} +;; + +{ .mfi + setf.exp exp_gt_pln = exp_GR_gt_ln + fma.s.s2 exp_wre_urm_f8 = exp_T, exp_P6, exp_A + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x40 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fcmp.ge.unc.s1 p6, p0 = exp_wre_urm_f8, exp_gt_pln + nop.i 999 +} +;; + +{ .mfb + nop.m 999 + nop.f 999 +(p6) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) // Branch if really overflow +} +;; + +{ .mfb + nop.m 999 + fma.s f8 = exp_T, exp_P6, exp_A + br.ret.sptk b0 // Exit if really no overflow +} +;; + +L(EXP_CERTAIN_OVERFLOW): +{ .mmi + sub exp_GR_17ones_m1 = exp_GR_17ones, r0, 1 ;; + setf.exp f9 = exp_GR_17ones_m1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fmerge.s FR_X = f8,f8 + nop.i 999 +} +{ .mfb + mov GR_Parameter_TAG = 16 + fma.s FR_RESULT = f9, f9, f0 // Set I,O and +INF result + br.cond.sptk __libm_error_region ;; +} + +L(EXP_POSSIBLE_UNDERFLOW): + +// We got an answer. EXP_MAX_SGL_UFLOW_ARG < x < EXP_MIN_SGL_NORM_ARG +// underflow is a possibility, not a certainty + +// We define an underflow when the answer with +// ftz set +// is zero (tiny numbers become zero) + +// Notice (from below) that if we have an unlimited exponent range, +// then there is an extra machine number E between the largest denormal and +// the smallest normal. + +// So if with unbounded exponent we round to E or below, then we are +// tiny and underflow has occurred. + +// But notice that you can be in a situation where we are tiny, namely +// rounded to E, but when the exponent is bounded we round to smallest +// normal. So the answer can be the smallest normal with underflow. + +// E +// -----+--------------------+--------------------+----- +// | | | +// 1.1...10 2^-7f 1.1...11 2^-7f 1.0...00 2^-7e +// 0.1...11 2^-7e (biased, 1) +// largest dn smallest normal + +// If the answer is = 0, we have underflowed. +// Then p6 is TRUE. 
Set the underflow tag, save input in FR_X, +// do the final calculation for IEEE result, and branch to error return. + +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x41 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s.s2 exp_ftz_urm_f8 = exp_T, exp_P6, exp_A + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x40 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fcmp.eq.unc.s1 p6, p0 = exp_ftz_urm_f8, f0 + nop.i 999 +} +;; + +{ .mfb + nop.m 999 + nop.f 999 +(p6) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW) // Branch if really underflow +} +;; + +{ .mfb + nop.m 999 + fma.s f8 = exp_T, exp_P6, exp_A + br.ret.sptk b0 // Exit if really no underflow +} +;; + +L(EXP_CERTAIN_UNDERFLOW): +{ .mfi + nop.m 999 + fmerge.s FR_X = f8,f8 + nop.i 999 +} +{ .mfb + mov GR_Parameter_TAG = 17 + fma.s FR_RESULT = exp_T, exp_P6, exp_A // Set I,U and tiny result + br.cond.sptk __libm_error_region ;; +} + +L(EXP_CERTAIN_UNDERFLOW_ZERO): +{ .mmi + mov exp_GR_one = 1 ;; + setf.exp f9 = exp_GR_one + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fmerge.s FR_X = f8,f8 + nop.i 999 +} +{ .mfb + mov GR_Parameter_TAG = 17 + fma.s FR_RESULT = f9, f9, f0 // Set I,U and tiny (+0.0) result + br.cond.sptk __libm_error_region ;; +} + +.endp expf +ASM_SIZE_DIRECTIVE(expf) + + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 999 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mfi + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + nop.f 0 + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address +} +{ .mib + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + 
br.call.sptk b0=__libm_error_support# // Call error handling function +};; + +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_expl.c b/sysdeps/ia64/fpu/e_expl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/e_expl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/e_fmod.S b/sysdeps/ia64/fpu/e_fmod.S new file mode 100644 index 0000000000..ae641f4c9b --- /dev/null +++ b/sysdeps/ia64/fpu/e_fmod.S @@ -0,0 +1,538 @@ +.file "fmod.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational +// Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//==================================================================== +// 2/02/00 Initial version +// 3/02/00 New Algorithm +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +//11/28/00 Set FR_Y to f9 +// +// API +//==================================================================== +// double fmod(double,double); +// +// Overview of operation +//==================================================================== +// fmod(a,b)=a-i*b, +// where i is an integer such that, if b!=0, +// |i|<|a/b| and |a/b-i|<1 +// +// Algorithm +//==================================================================== +// a). if |a|<|b|, return a +// b). get quotient and reciprocal overestimates accurate to +// 33 bits (q2,y2) +// c). if the exponent difference (exponent(a)-exponent(b)) +// is less than 32, truncate quotient to integer and +// finish in one iteration +// d). if exponent(a)-exponent(b)>=32 (q2>=2^32) +// round quotient estimate to single precision (k=RN(q2)), +// calculate partial remainder (a'=a-k*b), +// get quotient estimate (a'*y2), and repeat from c). 
+// +// Special cases +//==================================================================== +// b=+/-0: return NaN, call libm_error_support +// a=+/-Inf, a=NaN or b=NaN: return NaN +// +// Registers used +//==================================================================== +// Predicate registers: p6-p11 +// General registers: r2,r32 (ar.pfs), r33-r40 +// Floating point registers: f6-f15 + +#include "libm_support.h" + +.section .text + + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + +FR_X = f10 +FR_Y = f9 +FR_RESULT = f8 + + +.proc fmod# +.align 32 +.global fmod# +.align 32 + +fmod: +#ifdef _LIBC +.global __ieee754_fmod +.type __ieee754_fmod,@function +__ieee754_fmod: +#endif +// inputs in f8, f9 +// result in f8 + +{ .mfi + alloc r32=ar.pfs,1,4,4,0 + // f6=|a| + fmerge.s f6=f0,f8 + mov r2 = 0x0ffdd +} + {.mfi + nop.m 0 + // f7=|b| + fmerge.s f7=f0,f9 + nop.i 0;; +} + +{ .mfi + setf.exp f11 = r2 + // (1) y0 + frcpa.s1 f10,p6=f6,f7 + nop.i 0 +} + +// Y +-NAN, +-inf, +-0? p7 +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0xe7 + nop.i 999;; +} + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 +// e 3 +// X +-NAN, +-inf, ? p9 + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f8, 0xe3 + nop.i 999 +} + +// |x| < |y|?
Return x p8 +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p8,p0 = f6,f7 + nop.i 999 ;; +} + +{ .mfi + nop.m 0 + // normalize y (if |x|<|y|) + (p8) fma.s0 f9=f9,f1,f0 + nop.i 0;; +} + + { .mfi + mov r2=0x1001f + // (2) q0=a*y0 + (p6) fma.s1 f13=f6,f10,f0 + nop.i 0 +} +{ .mfi + nop.m 0 + // (3) e0 = 1 - b * y0 + (p6) fnma.s1 f12=f7,f10,f1 + nop.i 0;; +} + + {.mfi + nop.m 0 + // normalize x (if |x|<|y|) + (p8) fma.d.s0 f8=f8,f1,f0 + nop.i 0 +} +{.bbb + (p9) br.cond.spnt L(FMOD_X_NAN_INF) + (p7) br.cond.spnt L(FMOD_Y_NAN_INF_ZERO) + // if |x|<|y|, return + (p8) br.ret.spnt b0;; +} + + {.mfi + nop.m 0 + // normalize x + fma.s0 f6=f6,f1,f0 + nop.i 0 +} +{.mfi + nop.m 0 + // normalize y + fma.s0 f7=f7,f1,f0 + nop.i 0;; +} + + {.mfi + // f15=2^32 + setf.exp f15=r2 + // (4) q1=q0+e0*q0 + (p6) fma.s1 f13=f12,f13,f13 + nop.i 0 +} +{ .mfi + nop.m 0 + // (5) e1 = e0 * e0 + 2^-34 + (p6) fma.s1 f14=f12,f12,f11 + nop.i 0;; +} +{.mlx + nop.m 0 + movl r2=0x33a00000;; +} +{ .mfi + nop.m 0 + // (6) y1 = y0 + e0 * y0 + (p6) fma.s1 f10=f12,f10,f10 + nop.i 0;; +} +{.mfi + // set f12=1.25*2^{-24} + setf.s f12=r2 + // (7) q2=q1+e1*q1 + (p6) fma.s1 f13=f13,f14,f13 + nop.i 0;; +} +{.mfi + nop.m 0 + fmerge.s f9=f8,f9 + nop.i 0 +} +{ .mfi + nop.m 0 + // (8) y2 = y1 + e1 * y1 + (p6) fma.s1 f10=f14,f10,f10 + // set p6=0, p10=0 + cmp.ne.and p6,p10=r0,r0;; +} + +.align 32 +L(loop53): + {.mfi + nop.m 0 + // compare q2, 2^32 + fcmp.lt.unc.s1 p8,p7=f13,f15 + nop.i 0 +} + {.mfi + nop.m 0 + // will truncate quotient to integer, if exponent<32 (in advance) + fcvt.fx.trunc.s1 f11=f13 + nop.i 0;; +} + {.mfi + nop.m 0 + // if exponent>32, round quotient to single precision (perform in advance) + fma.s.s1 f13=f13,f1,f0 + nop.i 0;; +} + {.mfi + nop.m 0 + // set f12=sgn(a) + (p8) fmerge.s f12=f8,f1 + nop.i 0 +} + {.mfi + nop.m 0 + // normalize truncated quotient + (p8) fcvt.xf f13=f11 + nop.i 0;; +} + { .mfi + nop.m 0 + // calculate remainder (assuming f13=RZ(Q)) + (p7) fnma.s1 f14=f13,f7,f6 + nop.i 0 +} + {.mfi + 
nop.m 0 + // also if exponent>32, round quotient to single precision + // and subtract 1 ulp: q=q-q*(1.25*2^{-24}) + (p7) fnma.s.s1 f11=f13,f12,f13 + nop.i 0;; +} + + {.mfi + nop.m 0 + // (p8) calculate remainder (82-bit format) + (p8) fnma.s1 f11=f13,f7,f6 + nop.i 0 +} + {.mfi + nop.m 0 + // (p7) calculate remainder (assuming f11=RZ(Q)) + (p7) fnma.s1 f6=f11,f7,f6 + nop.i 0;; +} + + + {.mfi + nop.m 0 + // Final iteration (p8): is f6 the correct remainder (quotient was not overestimated) ? + (p8) fcmp.lt.unc.s1 p6,p10=f11,f0 + nop.i 0;; +} + {.mfi + nop.m 0 + // get new quotient estimation: a'*y2 + (p7) fma.s1 f13=f14,f10,f0 + nop.i 0 +} + {.mfb + nop.m 0 + // was f14=RZ(Q) ? (then new remainder f14>=0) + (p7) fcmp.lt.unc.s1 p7,p9=f14,f0 + nop.b 0;; +} + + +.pred.rel "mutex",p6,p10 + {.mfb + nop.m 0 + // add b to estimated remainder (to cover the case when the quotient was overestimated) + // also set correct sign by using f9=|b|*sgn(a), f12=sgn(a) + (p6) fma.d.s0 f8=f11,f12,f9 + nop.b 0 +} + {.mfb + nop.m 0 + // calculate remainder (single precision) + // set correct sign of result before returning + (p10) fma.d.s0 f8=f11,f12,f0 + (p8) br.ret.sptk b0;; +} + {.mfi + nop.m 0 + // if f13!=RZ(Q), get alternative quotient estimation: a''*y2 + (p7) fma.s1 f13=f6,f10,f0 + nop.i 0 +} + {.mfb + nop.m 0 + // if f14 was RZ(Q), set remainder to f14 + (p9) mov f6=f14 + br.cond.sptk L(loop53);; +} + + + +L(FMOD_X_NAN_INF): + +// Y zero ? +{.mfi + nop.m 0 + fma.s1 f10=f9,f1,f0 + nop.i 0;; +} +{.mfi + nop.m 0 + fcmp.eq.unc.s1 p11,p0=f10,f0 + nop.i 0;; +} +{.mib + nop.m 0 + nop.i 0 + // if Y zero + (p11) br.cond.spnt L(FMOD_Y_ZERO);; +} + +// X infinity? Return QNAN indefinite +{ .mfi + nop.m 999 +(p0) fclass.m.unc p8,p9 = f8, 0x23 + nop.i 999;; +} +// Y NaN ? 
+{.mfi + nop.m 999 +(p8) fclass.m p9,p8=f9,0xc3 + nop.i 0;; +} +{.mfi + nop.m 999 +(p8) frcpa.s0 f8,p0 = f8,f8 + nop.i 0 +} +{ .mfi + nop.m 999 + // also set Denormal flag if necessary +(p8) fma.s0 f9=f9,f1,f0 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p8) fma.d f8=f8,f1,f0 + nop.b 999 ;; +} + +{ .mfb + nop.m 999 +(p9) frcpa.s0 f8,p7=f8,f9 + br.ret.sptk b0 ;; +} + + +L(FMOD_Y_NAN_INF_ZERO): + +// Y INF +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0x23 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p7) fma.d f8=f8,f1,f0 +(p7) br.ret.spnt b0 ;; +} + +// Y NAN? +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f9, 0xc3 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p9) fma.d f8=f9,f1,f0 +(p9) br.ret.spnt b0 ;; +} + +L(FMOD_Y_ZERO): +// Y zero? Must be zero at this point +// because it is the only choice left. +// Return QNAN indefinite + +{.mfi + nop.m 0 + // set Invalid + frcpa f12,p0=f0,f0 + nop.i 0 +} +// X NAN? +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p10 = f8, 0xc3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fclass.nm p9,p10 = f8, 0xff + nop.i 999 ;; +} + +{.mfi + nop.m 999 + (p9) frcpa f11,p7=f8,f0 + nop.i 0;; +} + +{ .mfi + nop.m 999 +(p10) frcpa f11,p7 = f9,f9 +(p0) mov GR_Parameter_TAG = 121 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s f10 = f8, f8 + nop.i 999 +} + +{ .mfb + nop.m 999 +(p0) fma.d f8=f11,f1,f0 +(p0) br.sptk __libm_error_region;; +} + +.endp fmod +ASM_SIZE_DIRECTIVE(fmod) +ASM_SIZE_DIRECTIVE(__ieee754_fmod) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on 
stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_fmodf.S b/sysdeps/ia64/fpu/e_fmodf.S new file mode 100644 index 0000000000..9ac03a9a5e --- /dev/null +++ b/sysdeps/ia64/fpu/e_fmodf.S @@ -0,0 +1,553 @@ +.file "fmodf.s" +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational +// Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//==================================================================== +// 2/02/00 Initial version +// 3/02/00 New Algorithm +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +//11/28/00 Set FR_Y to f9 +// +// API +//==================================================================== +// float fmodf(float,float); +// +// Overview of operation +//==================================================================== +// fmod(a,b)=a-i*b, +// where i is an integer such that, if b!=0, +// |i|<|a/b| and |a/b-i|<1 + +// Algorithm +//==================================================================== +// a). if |a|<|b|, return a +// b). get quotient and reciprocal overestimates accurate to +// 33 bits (q2,y2) +// c). if the exponent difference (exponent(a)-exponent(b)) +// is less than 32, truncate quotient to integer and +// finish in one iteration +// d). if exponent(a)-exponent(b)>=32 (q2>=2^32) +// round quotient estimate to single precision (k=RN(q2)), +// calculate partial remainder (a'=a-k*b), +// get quotient estimate (a'*y2), and repeat from c). 
+ +// Special cases +//==================================================================== +// b=+/-0: return NaN, call libm_error_support +// a=+/-Inf, a=NaN or b=NaN: return NaN + +// Registers used +//==================================================================== +// Predicate registers: p6-p11 +// General registers: r2,r32 (ar.pfs), r33-r40 +// Floating point registers: f6-f15 + +#include "libm_support.h" + +.section .text + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + +FR_X = f10 +FR_Y = f9 +FR_RESULT = f8 + + + +.proc fmodf# +.align 32 +.global fmodf# +.align 32 + +fmodf: +#ifdef _LIBC +.global __ieee754_fmodf +.type __ieee754_fmodf,@function +__ieee754_fmodf: +#endif +// inputs in f8, f9 +// result in f8 + +{ .mfi + alloc r32=ar.pfs,1,4,4,0 + // f6=|a| + fmerge.s f6=f0,f8 + mov r2 = 0x0ffdd +} + {.mfi + nop.m 0 + // f7=|b| + fmerge.s f7=f0,f9 + nop.i 0;; +} + +{ .mfi + setf.exp f11 = r2 + // (1) y0 + frcpa.s1 f10,p6=f6,f7 + nop.i 0 +} + +// eliminate special cases +// Y +-NAN, +-inf, +-0? p7 +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0xe7 + nop.i 999;; +} + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 +// e 3 +// X +-NAN, +-inf, ? p9 + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f8, 0xe3 + nop.i 999 +} + +// |x| < |y|?
Return x p8 +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p8,p0 = f6,f7 + nop.i 999 ;; +} + +{ .mfi + nop.m 0 + // normalize y (if |x|<|y|) + (p8) fma.s0 f9=f9,f1,f0 + nop.i 0;; +} + + { .mfi + mov r2=0x1001f + // (2) q0=a*y0 + (p6) fma.s1 f13=f6,f10,f0 + nop.i 0 +} +{ .mfi + nop.m 0 + // (3) e0 = 1 - b * y0 + (p6) fnma.s1 f12=f7,f10,f1 + nop.i 0;; +} + + {.mfi + nop.m 0 + // normalize x (if |x|<|y|) + (p8) fma.s.s0 f8=f8,f1,f0 + nop.i 0 +} +{.bbb + (p9) br.cond.spnt L(FMOD_X_NAN_INF) + (p7) br.cond.spnt L(FMOD_Y_NAN_INF_ZERO) + // if |x|<|y|, return + (p8) br.ret.spnt b0;; +} + + {.mfi + nop.m 0 + // normalize x + fma.s0 f6=f6,f1,f0 + nop.i 0 +} +{.mfi + nop.m 0 + // normalize y + fma.s0 f7=f7,f1,f0 + nop.i 0;; +} + + + {.mfi + // f15=2^32 + setf.exp f15=r2 + // (4) q1=q0+e0*q0 + (p6) fma.s1 f13=f12,f13,f13 + nop.i 0 +} +{ .mfi + nop.m 0 + // (5) e1 = e0 * e0 + 2^-34 + (p6) fma.s1 f14=f12,f12,f11 + nop.i 0;; +} +{.mlx + nop.m 0 + movl r2=0x33a00000;; +} +{ .mfi + nop.m 0 + // (6) y1 = y0 + e0 * y0 + (p6) fma.s1 f10=f12,f10,f10 + nop.i 0;; +} +{.mfi + // set f12=1.25*2^{-24} + setf.s f12=r2 + // (7) q2=q1+e1*q1 + (p6) fma.s1 f13=f13,f14,f13 + nop.i 0;; +} +{.mfi + nop.m 0 + fmerge.s f9=f8,f9 + nop.i 0 +} +{ .mfi + nop.m 0 + // (8) y2 = y1 + e1 * y1 + (p6) fma.s1 f10=f14,f10,f10 + // set p6=0, p10=0 + cmp.ne.and p6,p10=r0,r0;; +} + +.align 32 +L(loop24): + {.mfi + nop.m 0 + // compare q2, 2^32 + fcmp.lt.unc.s1 p8,p7=f13,f15 + nop.i 0 +} + {.mfi + nop.m 0 + // will truncate quotient to integer, if exponent<32 (in advance) + fcvt.fx.trunc.s1 f11=f13 + nop.i 0;; +} + {.mfi + nop.m 0 + // if exponent>32, round quotient to single precision (perform in advance) + fma.s.s1 f13=f13,f1,f0 + nop.i 0;; +} + {.mfi + nop.m 0 + // set f12=sgn(a) + (p8) fmerge.s f12=f8,f1 + nop.i 0 +} + {.mfi + nop.m 0 + // normalize truncated quotient + (p8) fcvt.xf f13=f11 + nop.i 0;; +} + { .mfi + nop.m 0 + // calculate remainder (assuming f13=RZ(Q)) + (p7) fnma.s1 f14=f13,f7,f6 + nop.i 0 +} + {.mfi 
+ nop.m 0 + // also if exponent>32, round quotient to single precision + // and subtract 1 ulp: q=q-q*(1.25*2^{-24}) + (p7) fnma.s.s1 f11=f13,f12,f13 + nop.i 0;; +} + + {.mfi + nop.m 0 + // (p8) calculate remainder (82-bit format) + (p8) fnma.s1 f11=f13,f7,f6 + nop.i 0 +} + {.mfi + nop.m 0 + // (p7) calculate remainder (assuming f11=RZ(Q)) + (p7) fnma.s1 f6=f11,f7,f6 + nop.i 0;; +} + + + {.mfi + nop.m 0 + // Final iteration (p8): is f6 the correct remainder (quotient was not overestimated) ? + (p8) fcmp.lt.unc.s1 p6,p10=f11,f0 + nop.i 0;; +} + {.mfi + nop.m 0 + // get new quotient estimation: a'*y2 + (p7) fma.s1 f13=f14,f10,f0 + nop.i 0 +} + {.mfb + nop.m 0 + // was f14=RZ(Q) ? (then new remainder f14>=0) + (p7) fcmp.lt.unc.s1 p7,p9=f14,f0 + nop.b 0;; +} + + +.pred.rel "mutex",p6,p10 + {.mfb + nop.m 0 + // add b to estimated remainder (to cover the case when the quotient was overestimated) + // also set correct sign by using f9=|b|*sgn(a), f12=sgn(a) + (p6) fma.s.s0 f8=f11,f12,f9 + nop.b 0 +} + {.mfb + nop.m 0 + // calculate remainder (single precision) + // set correct sign of result before returning + (p10) fma.s.s0 f8=f11,f12,f0 + (p8) br.ret.sptk b0;; +} + {.mfi + nop.m 0 + // if f13!=RZ(Q), get alternative quotient estimation: a''*y2 + (p7) fma.s1 f13=f6,f10,f0 + nop.i 0 +} + {.mfb + nop.m 0 + // if f14 was RZ(Q), set remainder to f14 + (p9) mov f6=f14 + br.cond.sptk L(loop24);; +} + + { .mmb + nop.m 0 + nop.m 0 + br.ret.sptk b0;; + } + +L(FMOD_X_NAN_INF): + + +// Y zero ? +{.mfi + nop.m 0 + fma.s1 f10=f9,f1,f0 + nop.i 0;; +} +{.mfi + nop.m 0 + fcmp.eq.unc.s1 p11,p0=f10,f0 + nop.i 0;; +} +{.mib + nop.m 0 + nop.i 0 + // if Y zero + (p11) br.cond.spnt L(FMOD_Y_ZERO);; +} + +// X infinity? Return QNAN indefinite +{ .mfi + nop.m 999 +(p0) fclass.m.unc p8,p9 = f8, 0x23 + nop.i 999;; +} +// Y NaN ? 
+{.mfi + nop.m 999 +(p8) fclass.m p9,p8=f9,0xc3 + nop.i 0;; +} +{.mfi + nop.m 999 +(p8) frcpa.s0 f8,p0 = f8,f8 + nop.i 0 +} +{ .mfi + nop.m 999 + // also set Denormal flag if necessary +(p8) fma.s0 f9=f9,f1,f0 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p8) fma.s f8=f8,f1,f0 + nop.b 999 ;; +} + +{ .mfb + nop.m 999 +(p9) frcpa.s0 f8,p7=f8,f9 + br.ret.sptk b0 ;; +} + + +L(FMOD_Y_NAN_INF_ZERO): + +// Y INF +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0x23 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p7) fma.s f8=f8,f1,f0 +(p7) br.ret.spnt b0 ;; +} + +// Y NAN? +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f9, 0xc3 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p9) fma.s f8=f9,f1,f0 +(p9) br.ret.spnt b0 ;; +} + +L(FMOD_Y_ZERO): +// Y zero? Must be zero at this point +// because it is the only choice left. +// Return QNAN indefinite + +{.mfi + nop.m 0 + // set Invalid + frcpa f12,p0=f0,f0 + nop.i 999 +} +// X NAN? +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p10 = f8, 0xc3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fclass.nm p9,p10 = f8, 0xff + nop.i 999 ;; +} + +{.mfi + nop.m 999 + (p9) frcpa f11,p7=f8,f0 + nop.i 0;; +} + +{ .mfi + nop.m 999 +(p10) frcpa f11,p7 = f0,f0 +nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s f10 = f8, f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s f8=f11,f1,f0 + nop.i 999;; +} + +L(EXP_ERROR_RETURN): + + +{ .mib + nop.m 0 +(p0) mov GR_Parameter_TAG=122 +(p0) br.sptk __libm_error_region;; +} + +.endp fmodf +ASM_SIZE_DIRECTIVE(fmodf) +ASM_SIZE_DIRECTIVE(__ieee754_fmodf) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 
+};; +.body +{ .mib + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support#;; // Call error handling function +} +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_fmodl.S b/sysdeps/ia64/fpu/e_fmodl.S new file mode 100644 index 0000000000..7fbfd43a10 --- /dev/null +++ b/sysdeps/ia64/fpu/e_fmodl.S @@ -0,0 +1,577 @@ +.file "fmodl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational +// Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//==================================================================== +// 2/02/00 Initial version +// 3/02/00 New Algorithm +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +//11/28/00 Set FR_Y to f9 +// +// API +//==================================================================== +// long double fmodl(long double,long double); +// +// Overview of operation +//==================================================================== +// fmod(a,b)=a-i*b, +// where i is an integer such that, if b!=0, +// |i|<|a/b| and |a/b-i|<1 +// +// Algorithm +//==================================================================== +// a). if |a|<|b|, return a +// b). get quotient and reciprocal overestimates accurate to +// 33 bits (q2,y2) +// c). if the exponent difference (exponent(a)-exponent(b)) +// is less than 32, truncate quotient to integer and +// finish in one iteration +// d). if exponent(a)-exponent(b)>=32 (q2>=2^32) +// round quotient estimate to single precision (k=RN(q2)), +// calculate partial remainder (a'=a-k*b), +// get quotient estimate (a'*y2), and repeat from c). 
+// +// Registers used +//==================================================================== +// Predicate registers: p6-p11 +// General registers: r2,r29,r32 (ar.pfs), r33-r40 +// Floating point registers: f6-f15, f32-f33 + +#include "libm_support.h" + +.section .text + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + +FR_X = f10 +FR_Y = f9 +FR_RESULT = f8 + + + +.proc fmodl# +.align 32 +.global fmodl# +.align 32 + +fmodl: +#ifdef _LIBC +.global __ieee754_fmodl +.type __ieee754_fmodl,@function +__ieee754_fmodl: +#endif +// inputs in f8, f9 +// result in f8 + +{ .mfi + alloc r32=ar.pfs,1,4,4,0 + // f6=|a| + fmerge.s f6=f0,f8 + mov r2 = 0x0ffdd +} + {.mfi + getf.sig r29=f9 + // f7=|b| + fmerge.s f7=f0,f9 + nop.i 0;; +} + +{ .mfi + setf.exp f11 = r2 + // (1) y0 + frcpa.s1 f10,p6=f6,f7 + nop.i 0;; +} + +// eliminate special cases +{.mmi +nop.m 0 +nop.m 0 +// y pseudo-zero ? +cmp.eq p7,p10=r29,r0;; +} + +// Y +-NAN, +-inf, +-0? p7 +{ .mfi + nop.m 999 +(p10) fclass.m p7,p10 = f9, 0xe7 + nop.i 999;; +} + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 +// e 3 +// X +-NAN, +-inf, ? p9 + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p11 = f8, 0xe3 + nop.i 999 +} + +// |x| < |y|? Return x p8 +{ .mfi + nop.m 999 +(p10) fcmp.lt.unc.s1 p8,p0 = f6,f7 + nop.i 999 ;; +} + + { .mfi + mov r2=0x1001f + // (2) q0=a*y0 + (p6) fma.s1 f13=f6,f10,f0 + nop.i 0 +} { .mfi + nop.m 0 + // (3) e0 = 1 - b * y0 + (p6) fnma.s1 f12=f7,f10,f1 + nop.i 0;; +} + +// Y +-NAN, +-inf, +-0? p7 +{ .mfi + nop.m 999 + // pseudo-NaN ? +(p10) fclass.nm p7,p0 = f9, 0xff + nop.i 999 +} + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 +// e 3 +// X +-NAN, +-inf, ? p9 + +{ .mfi + nop.m 999 +(p11) fclass.nm p9,p0 = f8, 0xff + nop.i 999;; +} + +{ .mfi + nop.m 0 + // y denormal ?
set D flag (if |x|<|y|) + (p8) fnma.s0 f10=f9,f1,f9 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // normalize x (if |x|<|y|) + (p8) fma.s0 f8=f8,f1,f0 + nop.i 0 +} +{.bbb + (p9) br.cond.spnt L(FMOD_X_NAN_INF) + (p7) br.cond.spnt L(FMOD_Y_NAN_INF_ZERO) + // if |x|<|y|, return + (p8) br.ret.spnt b0;; +} + + {.mfi + nop.m 0 + // x denormal ? set D flag + fnma.s0 f32=f6,f1,f6 + nop.i 0 +} +{.mfi + nop.m 0 + // y denormal ? set D flag + fnma.s0 f33=f7,f1,f7 + nop.i 0;; +} + + {.mfi + // f15=2^32 + setf.exp f15=r2 + // (4) q1=q0+e0*q0 + (p6) fma.s1 f13=f12,f13,f13 + nop.i 0 +} +{ .mfi + nop.m 0 + // (5) e1 = e0 * e0 + 2^-34 + (p6) fma.s1 f14=f12,f12,f11 + nop.i 0;; +} +{.mlx + nop.m 0 + movl r2=0x33a00000;; +} +{ .mfi + nop.m 0 + // (6) y1 = y0 + e0 * y0 + (p6) fma.s1 f10=f12,f10,f10 + nop.i 0;; +} +{.mfi + // set f12=1.25*2^{-24} + setf.s f12=r2 + // (7) q2=q1+e1*q1 + (p6) fma.s1 f13=f13,f14,f13 + nop.i 0;; +} +{.mfi + nop.m 0 + fmerge.s f9=f8,f9 + nop.i 0 +} +{ .mfi + nop.m 0 + // (8) y2 = y1 + e1 * y1 + (p6) fma.s1 f10=f14,f10,f10 + // set p6=0, p10=0 + cmp.ne.and p6,p10=r0,r0;; +} + + +.align 32 +L(loop64): + {.mfi + nop.m 0 + // compare q2, 2^32 + fcmp.lt.unc.s1 p8,p7=f13,f15 + nop.i 0 +} + {.mfi + nop.m 0 + // will truncate quotient to integer, if exponent<32 (in advance) + fcvt.fx.trunc.s1 f11=f13 + nop.i 0;; +} + {.mfi + nop.m 0 + // if exponent>32, round quotient to single precision (perform in advance) + fma.s.s1 f13=f13,f1,f0 + nop.i 0;; +} + + + {.mfi + nop.m 0 + // set f12=sgn(a) + (p8) fmerge.s f12=f8,f1 + nop.i 0 +} + {.mfi + nop.m 0 + // normalize truncated quotient + (p8) fcvt.xf f13=f11 + nop.i 0;; +} + { .mfi + nop.m 0 + // calculate remainder (assuming f13=RZ(Q)) + (p7) fnma.s1 f14=f13,f7,f6 + nop.i 0 +} + {.mfi + nop.m 0 + // also if exponent>32, round quotient to single precision + // and subtract 1 ulp: q=q-q*(1.25*2^{-24}) + (p7) fnma.s.s1 f11=f13,f12,f13 + nop.i 0;; +} + + {.mfi + nop.m 0 + // (p8) calculate remainder (82-bit format) + (p8) fnma.s1 
f11=f13,f7,f6 + nop.i 0 +} + {.mfi + nop.m 0 + // (p7) calculate remainder (assuming f11=RZ(Q)) + (p7) fnma.s1 f6=f11,f7,f6 + nop.i 0;; +} + + + {.mfi + nop.m 0 + // Final iteration (p8): is f6 the correct remainder (quotient was not overestimated) ? + (p8) fcmp.lt.unc.s1 p6,p10=f11,f0 + nop.i 0;; +} + {.mfi + nop.m 0 + // get new quotient estimation: a'*y2 + (p7) fma.s1 f13=f14,f10,f0 + nop.i 0 +} + {.mfb + nop.m 0 + // was f13=RZ(Q) ? (then new remainder f14>=0) + (p7) fcmp.lt.unc.s1 p7,p9=f14,f0 + nop.b 0;; +} + + +.pred.rel "mutex",p6,p10 + {.mfb + nop.m 0 + // add b to estimated remainder (to cover the case when the quotient was overestimated) + // also set correct sign by using f9=|b|*sgn(a), f12=sgn(a) + (p6) fma.s0 f8=f11,f12,f9 + nop.b 0 +} + {.mfb + nop.m 0 + // set correct sign of result before returning: f12=sgn(a) + (p10) fma.s0 f8=f11,f12,f0 + (p8) br.ret.sptk b0;; +} + {.mfi + nop.m 0 + // if f13!=RZ(Q), get alternative quotient estimation: a''*y2 + (p7) fma.s1 f13=f6,f10,f0 + nop.i 0 +} + {.mfb + nop.m 0 + // if f14 was RZ(Q), set remainder to f14 + (p9) mov f6=f14 + br.cond.sptk L(loop64);; +} + + + +L(FMOD_X_NAN_INF): + +// Y zero ? +{.mfi + nop.m 0 + fma.s1 f10=f9,f1,f0 + nop.i 0;; +} +{.mfi + nop.m 0 + fcmp.eq.unc.s1 p11,p0=f10,f0 + nop.i 0;; +} +{.mib + nop.m 0 + nop.i 0 + // if Y zero + (p11) br.cond.spnt L(FMOD_Y_ZERO);; +} + +// X infinity? Return QNAN indefinite +{ .mfi + // set p7 t0 0 + cmp.ne p7,p0=r0,r0 +(p0) fclass.m.unc p8,p9 = f8, 0x23 + nop.i 999;; +} +// Y NaN ? +{.mfi + nop.m 999 +(p8) fclass.m p9,p8=f9,0xc3 + nop.i 0;; +} +// Y not pseudo-zero ? 
(r29 holds significand) +{.mii + nop.m 999 +(p8) cmp.ne p7,p0=r29,r0 + nop.i 0;; +} +{.mfi + nop.m 999 +(p8) frcpa.s0 f8,p0 = f8,f8 + nop.i 0 +} +{ .mfi + nop.m 999 + // also set Denormal flag if necessary +(p7) fnma.s0 f9=f9,f1,f9 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p8) fma.s0 f8=f8,f1,f0 + nop.b 999 ;; +} + +{ .mfb + nop.m 999 +(p9) frcpa.s0 f8,p7=f8,f9 + br.ret.sptk b0 ;; +} + + +L(FMOD_Y_NAN_INF_ZERO): +// Y INF +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0x23 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p7) fma f8=f8,f1,f0 +(p7) br.ret.spnt b0 ;; +} + +// Y NAN? +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p10 = f9, 0xc3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fclass.nm p9,p0 = f9, 0xff + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p9) fma f8=f9,f1,f0 +(p9) br.ret.spnt b0 ;; +} + +L(FMOD_Y_ZERO): +// Y zero? Must be zero at this point +// because it is the only choice left. +// Return QNAN indefinite + +{.mfi + nop.m 0 + // set Invalid + frcpa f12,p0=f0,f0 + nop.i 0 +} +// X NAN? 
+{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p10 = f8, 0xc3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fclass.nm p9,p10 = f8, 0xff + nop.i 999 ;; +} + +{.mfi + nop.m 999 + (p9) frcpa f11,p7=f8,f0 + nop.i 0;; +} + + +{ .mfi + nop.m 999 +(p10) frcpa f11,p7 = f9,f9 +(p0) mov GR_Parameter_TAG = 120 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s f10 = f8, f8 + nop.i 999 +} + +{ .mfb + nop.m 999 +(p0) fma f8=f11,f1,f0 +(p0) br.sptk __libm_error_region;; +} + +.endp fmodl +ASM_SIZE_DIRECTIVE(fmodl) +ASM_SIZE_DIRECTIVE(__ieee754_fmodl) + + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_hypot.S b/sysdeps/ia64/fpu/e_hypot.S new file mode 100644 index 0000000000..2fc9633567 --- 
/dev/null +++ b/sysdeps/ia64/fpu/e_hypot.S @@ -0,0 +1,438 @@ +.file "hypot.asm" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang of the +// Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// ********************************************************************* +// +// History: +// 2/02/00 hand-optimized +// 4/04/00 Unwind support added +// 6/20/00 new version +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// +// ********************************************************************* +// ___________ +// Function: hypot(x,y) = |(x^2 + y^2) = for double precision values +// x and y +// Also provides cabs functionality. 
+// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9 (Input) +// f6 -f15, f32-f34 +// +// General Purpose Registers: +// r2,r3,r29 (Scratch) +// r32-r35 (Locals) +// r36-r39 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6 - p10 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// All faults and exceptions should be raised correctly. +// Overflow can occur. +// hypot(Infinity and anything) = +Infinity +// hypot(QNaN and anything) = QNaN +// hypot(SNaN and anything ) = QNaN +// +// ********************************************************************* +// +// Implementation: +// x2 = x * x in double-extended +// y2 = y * y in double-extended +// temp = x2 + y2 in double-extended +// sqrt(temp) rounded to double +// +// ********************************************************************* + +#include "libm_support.h" + +GR_SAVE_PFS = r33 +GR_SAVE_B0 = r34 +GR_SAVE_GP = r35 +GR_Parameter_X = r36 +GR_Parameter_Y = r37 +GR_Parameter_RESULT = r38 +GR_Parameter_TAG = r39 + +FR_X = f32 +FR_Y = f33 +FR_RESULT = f8 + +.section .text +#ifndef _LIBC +.proc cabs# +.global cabs# +cabs: +.endp cabs +#endif +.proc hypot# +.global hypot# +.align 64 + +hypot: +#ifdef _LIBC +.global __hypot +__hypot: +.global __ieee754_hypot +__ieee754_hypot: +#endif +{.mfi + alloc r32= ar.pfs,0,4,4,0 + // Compute x*x + fma.s1 f10=f8,f8,f0 + // r2=bias-1 + mov r2=0xfffe +} +{.mfi + // 63/8 + mov r3=0x40fc //0000 + // y*y + fma.s1 f11=f9,f9,f0 + // r29=429/16 + mov r29=0x41d68;; //000 +} + +{ .mfi + nop.m 0 +// Check if x is an Inf - if so return Inf even +// if y is a NaN (C9X) + fclass.m.unc p7, p6 = f8, 0x023 + shl r3=r3,16 +} +{.mfi + nop.m 0 + // if possible overflow, copy f8 to f32 + // set Denormal, if necessary + // (p8) + fma.d.s0 f32=f8,f1,f0 + nop.i 0;; +} +{ .mfi + nop.m 0 +//
Check if y is an Inf - if so return Inf even +// if x is a NaN (C9X) + fclass.m.unc p8, p9 = f9, 0x023 + shl r29=r29,12 +} +{ .mfb + // f7=0.5 + setf.exp f7=r2 +// For x=inf, multiply y by 1 to raise invalid on y an SNaN +// (p7) fma.s0 f9=f9,f1,f0 + // copy f9 to f33; set Denormal, if necessary + fma.d.s0 f33=f9,f1,f0 + nop.b 0;; +} +{.mfb + // f13=63/8 + setf.s f13=r3 + // is y Zero ? + (p6) fclass.m p6,p0=f9,0x7 + nop.b 0 +} +{.mlx + nop.m 0 + movl r2=0x408c0000;; +} + +{.mfi + // f34=429/16 + setf.s f34=r29 + // is x Zero ? + (p9) fclass.m p9,p0=f8,0x7 + // 231/16 + mov r3=0x4167;; //0000 +} +{.mfi + nop.m 0 + // a=x2+y2 + fma.s1 f12=f10,f1,f11 + nop.i 0;; +} +{.mfi + nop.m 0 + // y not NaN ? + (p9) fclass.m p8,p0=f9,0x3f + shl r3=r3,16 +} +{.mfi + nop.m 0 + // f6=2 + fma.s1 f6=f1,f1,f1 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // x not NaN ? + (p6) fclass.m p7,p0=f8,0x3f + nop.i 0;; +} +{.mfi + // f9=35/8 + setf.s f9=r2 + nop.f 0 + // 2*emax-2 + mov r2=0x107fb;; +} + +{.mfb + nop.m 0 + // if f8=Infinity or f9=Zero, return |f8| + (p7) fmerge.s f8=f0,f32 + (p7) br.ret.spnt b0 +} +{.mfb + nop.m 0 + // if f9=Infinity or f8=Zero, return |f9| + (p8) fmerge.s f8=f0,f33 + (p8) br.ret.spnt b0;; +} + + +{.mfi + // f10 =231/16 + setf.s f10=r3 + // z0=frsqrta(a) + frsqrta.s1 f8,p6=f12 + nop.i 0;; +} + +{ .mfi + nop.m 0 +// Identify Natvals, Infs, NaNs, and Zeros +// and return result + fclass.m.unc p7, p0 = f12, 0x1E7 + nop.i 0;; +} +{.mfb + // get exponent of x^2+y^2 + getf.exp r3=f12 + // if special case, set f8 + (p7) mov f8=f12 + (p7) br.ret.spnt b0;; +} + + +{.mfi + nop.m 0 + // S0=a*z0 + (p6) fma.s1 f14=f12,f8,f0 + nop.i 0 +} +{.mfi + nop.m 0 + // H0=0.5*z0 + (p6) fma.s1 f15=f8,f7,f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // f6=5/2 + fma.s1 f6=f7,f1,f6 + nop.i 0 +} +{.mfi + nop.m 0 + // f11=3/2 + fma.s1 f11=f7,f1,f1 + nop.i 0;; +} + +{.mfi + nop.m 0 + // d=0.5-S0*H0 + (p6) fnma.s1 f7=f14,f15,f7 + nop.i 0;; +} + +{.mfi + nop.m 0 + // P67=231/16+429/16*d + (p6) fma.s1 
f10=f34,f7,f10 + nop.i 0 +} +{.mfi + nop.m 0 + // P45=63/8*d+35/8 + (p6) fma.s1 f9=f13,f7,f9 + nop.i 0;; +} +{.mfi + nop.m 0 + // P23=5/2*d+3/2 + (p6) fma.s1 f11=f6,f7,f11 + nop.i 0 +} +{.mfi + nop.m 0 + // d2=d*d + (p6) fma.s1 f13=f7,f7,f0 + nop.i 0;; +} + +{.mfi + nop.m 0 + // P47=d2*P67+P45 + (p6) fma.s1 f10=f10,f13,f9 + nop.i 0 +} +{.mfi + nop.m 0 + // P13=d*P23+1 + (p6) fma.s1 f11=f11,f7,f1 + nop.i 0;; +} +{.mfi + nop.m 0 + // d3=d2*d + (p6) fma.s1 f13=f13,f7,f0 + nop.i 0;; +} + +{.mfi + nop.m 0 + // T0=d*S0 + (p6) fma.s1 f15=f7,f14,f0 + nop.i 0 +} +{.mfi + // Is x^2 + y^2 well less than the overflow + // threshold? + (p6) cmp.lt.unc p7, p8 = r3,r2 + // P=P13+d3*P47 + (p6) fma.s1 f10=f13,f10,f11 + nop.i 0;; +} + +{.mfb + nop.m 0 + // S=P*T0+S0 + fma.d.s0 f8=f10,f15,f14 + // No overflow in this case + (p7) br.ret.sptk b0;; +} + +{ .mfi + nop.m 0 +(p8) fsetc.s2 0x7F,0x42 + // Possible overflow path, must detect by + // Setting widest range exponent with prevailing + // rounding mode. 
+ nop.i 0 ;; +} + + +{ .mfi + // bias+0x400 (bias+EMAX+1) + (p8) mov r2=0x103ff + // S=P*T0+S0 + (p8) fma.d.s2 f12=f10,f15,f14 + nop.i 0 ;; +} +{ .mfi +(p8) setf.exp f11 = r2 +(p8) fsetc.s2 0x7F,0x40 +// Restore Original Mode in S2 + nop.i 0 ;; +} +{ .mfi + nop.m 0 +(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11 + nop.i 0 ;; +} +{ .mib + nop.m 0 + mov GR_Parameter_TAG = 46 + // No overflow +(p9) br.ret.sptk b0;; +} +.endp hypot +ASM_SIZE_DIRECTIVE(hypot) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_hypotf.S b/sysdeps/ia64/fpu/e_hypotf.S new file mode 100644 index 0000000000..18a5e32d1c --- /dev/null +++ b/sysdeps/ia64/fpu/e_hypotf.S @@ -0,0 +1,394 @@ +.file "hypotf.asm" + 
+// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang of the +// Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// ********************************************************************* +// +// History: +// 2/02/00 hand-optimized +// 4/04/00 Unwind support added +// 6/26/00 new version +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// +// ********************************************************************* +// +// Function: hypotf(x,y) = sqrt(x^2 + y^2) for single precision values +// x and y +// Also provides cabsf functionality. 
+// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9 (Input) +// f6 -f15 +// +// General Purpose Registers: +// r2-r3 (Scratch) +// r32-r35 (Locals) +// r36-r39 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6 - p10 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// All faults and exceptions should be raised correctly. +// Overflow can occur. +// hypotf(Infinity and anything) = +Infinity +// hypotf(QNaN and anything) = QNaN +// hypotf(SNaN and anything ) = QNaN +// +// ********************************************************************* +// +// Implementation: +// x2 = x * x in double-extended +// y2 = y * y in double-extended +// temp = x2 + y2 in double-extended +// sqrt(temp) rounded to single precision +// +// ********************************************************************* + +#include "libm_support.h" + +GR_SAVE_PFS = r33 +GR_SAVE_B0 = r34 +GR_SAVE_GP = r35 +GR_Parameter_X = r36 +GR_Parameter_Y = r37 +GR_Parameter_RESULT = r38 +GR_Parameter_TAG = r39 + +FR_X = f14 +FR_Y = f15 +FR_RESULT = f8 + +.section .text +#ifndef _LIBC +.proc cabsf# +.global cabsf# +cabsf: +.endp cabsf +#endif +.proc hypotf# +.global hypotf# +.align 64 + +hypotf: +#ifdef _LIBC +.global __hypotf +__hypotf: +.global __ieee754_hypotf +__ieee754_hypotf: +#endif +{.mfi + alloc r32= ar.pfs,0,4,4,0 + // Compute x*x + fma.s1 f10=f8,f8,f0 + // r2=bias-1 + mov r2=0xfffe +} +{.mfi + nop.m 0 + // y*y + fma.s1 f11=f9,f9,f0 + nop.i 0;; +} + +{ .mfi + nop.m 0 +// Check if x is an Inf - if so return Inf even +// if y is a NaN (C9X) + fclass.m.unc p7, p6 = f8, 0x023 + nop.i 0 +} +{.mfi + nop.m 0 + // if possible overflow, copy f8 to f14 + // set Denormal, if necessary + // (p8) + fma.s.s0 f14=f8,f1,f0 + nop.i 0;; +} + +{ .mfi + nop.m 0 +// Check if y is an Inf - if so return Inf even 
+// if x is a NaN (C9X) + fclass.m.unc p8, p9 = f9, 0x023 + nop.i 0 +} +{ .mfi + nop.m 0 +// For x=inf, multiply y by 1 to raise invalid on y an SNaN +// (p7) fma.s0 f9=f9,f1,f0 + // copy f9 to f15; set Denormal, if necessary + fma.s.s0 f15=f9,f1,f0 + nop.i 0;; +} +{.mfi + nop.m 0 + // is y Zero ? + (p6) fclass.m p6,p0=f9,0x7 + nop.i 0;; +} +{.mfi + nop.m 0 + // is x Zero ? + (p9) fclass.m p9,p0=f8,0x7 + nop.i 0;; +} + +{.mfi + // f7=0.5 + setf.exp f7=r2 + // a=x2+y2 + fma.s1 f12=f10,f1,f11 + nop.i 0;; +} + +{.mfi + nop.m 0 + // x not NaN ? + (p6) fclass.m p7,p0=f8,0x3f + nop.i 0 +} +{.mfi + // 2*emax-2 + mov r2=0x100fb + // f6=2 + fma.s1 f6=f1,f1,f1 + nop.i 0;; +} + +{.mfi + nop.m 0 + // y not NaN ? + (p9) fclass.m p8,p0=f9,0x3f + nop.i 0;; +} + +{.mfb + nop.m 0 + // if f8=Infinity or f9=Zero, return |f8| + (p7) fmerge.s f8=f0,f14 + (p7) br.ret.spnt b0 +} +{.mfb + nop.m 0 + // if f9=Infinity or f8=Zero, return |f9| + (p8) fmerge.s f8=f0,f15 + (p8) br.ret.spnt b0;; +} + +{ .mfi + nop.m 0 +// Identify Natvals, Infs, NaNs, and Zeros +// and return result + fclass.m.unc p7, p0 = f12, 0x1E7 + nop.i 0 +} +{.mfi + nop.m 0 + // z0=frsqrta(a) + frsqrta.s1 f8,p6=f12 + nop.i 0;; +} + +{.mfb + // get exponent of x^2+y^2 + getf.exp r3=f12 + // if special case, set f8 + (p7) mov f8=f12 + (p7) br.ret.spnt b0;; +} + + +{.mfi + nop.m 0 + // S0=a*z0 + (p6) fma.s1 f12=f12,f8,f0 + nop.i 0 +} +{.mfi + nop.m 0 + // H0=0.5*z0 + (p6) fma.s1 f10=f8,f7,f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // f6=5/2 + fma.s1 f6=f7,f1,f6 + nop.i 0 +} +{.mfi + nop.m 0 + // f11=3/2 + fma.s1 f11=f7,f1,f1 + nop.i 0;; +} + +{.mfi + nop.m 0 + // d=0.5-S0*H0 + (p6) fnma.s1 f7=f12,f10,f7 + nop.i 0;; +} + +{.mfi + nop.m 0 + // P01=d+1 + (p6) fma.s1 f10=f1,f7,f1 + nop.i 0 +} +{.mfi + nop.m 0 + // P23=5/2*d+3/2 + (p6) fma.s1 f11=f6,f7,f11 + nop.i 0;; +} +{.mfi + nop.m 0 + // d2=d*d + (p6) fma.s1 f7=f7,f7,f0 + nop.i 0;; +} + + +{.mfi + // Is x^2 + y^2 well less than the overflow + // threshold? 
+ (p6) cmp.lt.unc p7, p8 = r3,r2 + // P=P01+d2*P23 + (p6) fma.s1 f10=f7,f11,f10 + nop.i 0;; +} + +{.mfb + nop.m 0 + // S=P*S0 + fma.s.s0 f8=f10,f12,f0 + // No overflow in this case + (p7) br.ret.sptk b0;; +} + +{ .mfi + nop.m 0 +(p8) fsetc.s2 0x7F,0x42 + // Possible overflow path, must detect by + // Setting widest range exponent with prevailing + // rounding mode. + nop.i 0 ;; +} + + +{ .mfi + // bias+0x80 (bias+EMAX+1) + (p8) mov r2=0x1007f + // S=P*S0 + (p8) fma.s.s2 f12=f10,f12,f0 + nop.i 0 ;; +} +{ .mfi +(p8) setf.exp f11 = r2 +(p8) fsetc.s2 0x7F,0x40 +// Restore Original Mode in S2 + nop.i 0 ;; +} +{ .mfi + nop.m 0 +(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11 + nop.i 0 ;; +} +{ .mib + nop.m 0 + mov GR_Parameter_TAG = 47 + // No overflow +(p9) br.ret.sptk b0;; +} +.endp hypotf +ASM_SIZE_DIRECTIVE(hypotf) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mii + add GR_Parameter_Y=-32,sp // Parameter 2 value +(p0) mov GR_Parameter_TAG = 47 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // 
Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_hypotl.S b/sysdeps/ia64/fpu/e_hypotl.S new file mode 100644 index 0000000000..54ca849737 --- /dev/null +++ b/sysdeps/ia64/fpu/e_hypotl.S @@ -0,0 +1,478 @@ +.file "hypotl.asm" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang of the +// Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// ********************************************************************* +// +// History: +// 2/02/00 hand-optimized +// 4/04/00 Unwind support added +// 6/20/00 new version +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. 
+// +// ********************************************************************* +// +// Function: hypotl(x,y) = sqrt(x^2 + y^2) for double extended values +// x and y +// Also provides cabsl functionality. +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9 (Input) +// f6 -f15, f32-f34 +// +// General Purpose Registers: +// r2-r3 (Scratch) +// r32-r35 (Locals) +// r36-r39 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6 - p10 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// All faults and exceptions should be raised correctly. +// Overflow can occur. +// hypotl(Infinity and anything) = +Infinity +// hypotl(QNaN and anything) = QNaN +// hypotl(SNaN and anything ) = QNaN +// +// ********************************************************************* +// +// Implementation: +// x2 = x * x in double-extended +// y2 = y * y in double-extended +// temp = x2 + y2 in double-extended +// sqrt(temp) rounded to double extended +// +// ********************************************************************* + +#include "libm_support.h" + +GR_SAVE_PFS = r33 +GR_SAVE_B0 = r34 +GR_SAVE_GP = r35 +GR_Parameter_X = r36 +GR_Parameter_Y = r37 +GR_Parameter_RESULT = r38 +GR_Parameter_TAG = r39 + +FR_X = f32 +FR_Y = f33 +FR_RESULT = f8 + +.section .text +#ifndef _LIBC +.proc cabsl# +.global cabsl# +cabsl: +.endp cabsl +#endif +.proc hypotl# +.global hypotl# +.align 64 + +hypotl: +#ifdef _LIBC +.global __hypotl +__hypotl: +.global __ieee754_hypotl +__ieee754_hypotl: +#endif +{.mfi + alloc r32= ar.pfs,0,4,4,0 + // Compute x*x + fma.s1 f10=f8,f8,f0 + // r2=bias-1 + mov r2=0xfffe +} +{.mfi + nop.m 0 + // y*y + fma.s1 f11=f9,f9,f0 + nop.i 0;; +} + +{ .mfi + nop.m 0 +// Check if x is an Inf - if so return Inf even +// if y is a NaN (C9X) + fclass.m.unc p7, p6 = f8, 
0x023 + nop.i 0 +} +{.mfi + nop.m 0 + // if possible overflow, copy f8 to f32 + // set Denormal, if necessary + // (p8) + fma.s0 f32=f8,f1,f0 + nop.i 0;; +} +{ .mfi + nop.m 0 +// Check if y is an Inf - if so return Inf even +// if x is a NaN (C9X) + fclass.m.unc p8, p9 = f9, 0x023 + nop.i 0 +} +{ .mfi + nop.m 999 +// For x=inf, multiply y by 1 to raise invalid on y an SNaN +// (p7) fma.s0 f9=f9,f1,f0 + // copy f9 to f33; set Denormal, if necessary + fma.s0 f33=f9,f1,f0 + nop.i 0;; +} +{.mfi + nop.m 0 + // is y Zero ? + (p6) fclass.m p6,p0=f9,0x7 + nop.i 0;; +} + +{.mfi + // f7=0.5 + setf.exp f7=r2 + // a=x2+y2 + fma.s1 f12=f10,f1,f11 + nop.i 0 +} +{.mfi + mov r2=0x408c //0000 + // dx=x*x-x2 + fms.s1 f13=f8,f8,f10 + nop.i 0;; +} +{.mfi + nop.m 0 + // is x Zero ? + (p9) fclass.m p9,p0=f8,0x7 + shl r2=r2,16 +} +{.mfi + nop.m 0 + // dy=y*y-y2 + fms.s1 f14=f9,f9,f11 + nop.i 0;; +} + +{.mfi + nop.m 0 + // x not NaN ? + (p6) fclass.m p7,p0=f8,0x3f + nop.i 0 +} +{.mfi + nop.m 0 + // f6=2 + fma.s1 f6=f1,f1,f1 + nop.i 0;; +} + +{.mfi + nop.m 0 + // f34=min(x2,y2) + famin.s1 f34=f10,f11 + nop.i 0 +} +{.mfb + nop.m 0 + // f10=max(x2,y2) + famax.s1 f10=f11,f10 + nop.b 0;; // +} + +{.mfi + nop.m 0 + // y not NaN ? 
+ (p9) fclass.m p8,p0=f9,0x3f + nop.i 0;; +} +{.mfb + // f9=35/8 + setf.s f9=r2 + // if f8=Infinity or f9=Zero, return |f8| + (p7) fmerge.s f8=f0,f32 + (p7) br.ret.spnt b0;; +} + + +{.mfi + nop.m 0 + // z0=frsqrta(a) + frsqrta.s1 f8,p6=f12 + nop.i 0;; +} +{ .mfi + nop.m 0 +// Identify Natvals, Infs, NaNs, and Zeros +// and return result + fclass.m.unc p7, p0 = f12, 0x1E7 + nop.i 0 +} +{.mfi + // get exponent of x^2+y^2 + getf.exp r3=f12 + // dxy=dx+dy + fma.s1 f13=f13,f1,f14 + nop.i 0;; +} + +{.mfb + // 2*emax-2 + mov r2=0x17ffb + // if f9=Infinity or f8=Zero, return |f9| + (p8) fmerge.s f8=f0,f33 + (p8) br.ret.spnt b0 +} +{.mfi + nop.m 0 + // dd=a-max(x2,y2) + fnma.s1 f10=f10,f1,f12 + nop.i 0;; +} + +{.mfi + nop.m 0 + // S0=a*z0 + (p6) fma.s1 f14=f12,f8,f0 + nop.i 0 +} +{.mfi + nop.m 0 + // H0=0.5*z0 + (p6) fma.s1 f15=f8,f7,f0 + nop.i 0;; +} + +{.mfb + nop.m 0 + // if special case, set f8 + (p7) mov f8=f12 + (p7) br.ret.spnt b0 +} +{.mfi + nop.m 0 + // da=min(x2,y2)-dd + fnma.s1 f10=f10,f1,f34 + nop.i 0;; +} +{.mfi + nop.m 0 + // f6=5/2 + fma.s1 f6=f7,f1,f6 + nop.i 0 +} +{.mfi + nop.m 0 + // f11=3/2 + fma.s1 f11=f7,f1,f1 + nop.i 0;; +} + +{.mfi + nop.m 0 + // d=0.5-S0*H0 + (p6) fnma.s1 f7=f14,f15,f7 + nop.i 0;; +} + +{.mfi + nop.m 0 + // P1=3/2*d+1 + (p6) fma.s1 f11=f11,f7,f1 + nop.i 0 +} +{.mfi + nop.m 0 + // P2=35/8*d+5/2 + (p6) fma.s1 f9=f9,f7,f6 + nop.i 0;; +} +{.mfi + nop.m 0 + // d2=d*d + (p6) fma.s1 f34=f7,f7,f0 + nop.i 0;; +} + +{.mfi + nop.m 0 + // T0=d*S0 + (p6) fma.s1 f6=f7,f14,f0 + nop.i 0 +} +{.mfi + nop.m 0 + // G0=d*H0 + (p6) fma.s1 f7=f7,f15,f0 + nop.i 0;; +} +{.mfi + nop.m 0 + // P=d2*P2+P1 + (p6) fma.s1 f11=f34,f9,f11 + nop.i 0;; +} + +{.mfi + nop.m 0 + // S1=p*T0+S0 + (p6) fma.s1 f14=f11,f6,f14 + nop.i 0 +} +{.mfi + nop.m 0 + // H1=p*G0+H0 + (p6) fma.s1 f15=f11,f7,f15 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // e1=a-S1*S1 + (p6) fnma.s1 f7=f14,f14,f12 + nop.i 0 +} +{.mfi + // Is x^2 + y^2 well less than the overflow + // threshold? 
+ (p6) cmp.lt.unc p7, p8 = r3,r2 + // c=dxy+da + (p6) fma.s1 f13=f13,f1,f10 + nop.i 0;; +} + +{.mfi + nop.m 0 + // e=e1+c + (p6) fma.s1 f13=f7,f1,f13 + nop.i 0;; +} + +{.mfb + nop.m 0 + // S=e*H1+S1 + fma.s0 f8=f13,f15,f14 + // No overflow in this case + (p7) br.ret.sptk b0;; +} + +{ .mfi + nop.m 0 +(p8) fsetc.s2 0x7F,0x42 + // Possible overflow path, must detect by + // Setting widest range exponent with prevailing + // rounding mode. + nop.i 0 ;; +} + + +{ .mfi + // bias+0x4000 (bias+EMAX+1) + (p8) mov r2=0x13fff + // S=e*H1+S1 + (p8) fma.s2 f12=f13,f15,f14 + nop.i 0 ;; +} +{ .mfi +(p8) setf.exp f11 = r2 +(p8) fsetc.s2 0x7F,0x40 +// Restore Original Mode in S2 + nop.i 0 ;; +} +{ .mfi + nop.m 0 +(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11 + nop.i 0 ;; +} +{ .mib + nop.m 0 + mov GR_Parameter_TAG = 45; + // No overflow +(p9) br.ret.sptk b0;; +} +.endp hypotl +ASM_SIZE_DIRECTIVE(hypotl) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = 
GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_log.S b/sysdeps/ia64/fpu/e_log.S new file mode 100644 index 0000000000..09e305dd08 --- /dev/null +++ b/sysdeps/ia64/fpu/e_log.S @@ -0,0 +1,1091 @@ +.file "log.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// History +//============================================================== +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 6/16/00 Updated table to be rounded correctly +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 8/17/00 Improved speed of main path by 5 cycles +// Shortened path for x=1.0 +// 1/09/01 Improved speed, fixed flags for neg denormals +// +// +// API +//============================================================== +// double log(double) +// double log10(double) +// +// Overview of operation +//============================================================== +// Background +// +// Consider x = 2^N 1.f1 f2 f3 f4...f63 +// Log(x) = log(frcpa(x) x/frcpa(x)) +// = log(1/frcpa(x)) + log(frcpa(x) x) +// = -log(frcpa(x)) + log(frcpa(x) x) +// +// frcpa(x) = 2^-N frcpa(1.f1 f2 ... f63) +// +// -log(frcpa(x)) = -log(C) +// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63)) +// +// -log(frcpa(x)) = -log(C) +// = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) +// +// -log(frcpa(x)) = -log(C) +// = +Nlog2 + log(1/frcpa(1.f1 f2 ... f63)) +// +// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x) + +// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x) +// Log(x) = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x) +// Log(x) = +Nlog2 + T + log(frcpa(x) x) +// +// Log(x) = +Nlog2 + T + log(C x) +// +// Cx = 1 + r +// +// Log(x) = +Nlog2 + T + log(1+r) +// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....) +// +// 1.f1 f2 ... f8 has 256 entries. +// They are 1 + k/2^8, k = 0 ... 255 +// These 256 values are the table entries. 
+// +// Implementation +//=============== +// CASE 1: |x-1| >= 2^-6 +// C = frcpa(x) +// r = C * x - 1 +// +// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 + P4*r^5 + P5*r^6 +// +// x = f * 2^n where f is 1.f_1f_2f_3....f_63 +// Nfloat = float(n) where n is the true unbiased exponent +// pre_index = f_1f_2....f_8 +// index = pre_index * 16 +// get the dxt table entry at index + offset = T +// +// result = (T + Nfloat * log(2)) + rseries +// +// The T table is calculated as follows +// Form x_k = 1 + k/2^8 where k goes from 0... 255 +// y_k = frcpa(x_k) +// log(1/y_k) in quad and round to double-extended + +// CASE 2: |x-1| < 2^-6 +// w = x - 1 +// +// Form wseries = w + Q1*w^2 + Q2*w^3 + ... + Q7*w^8 + Q8*w^9 +// +// result = wseries + +// Special values +//============================================================== + + +// log(+0) = -inf +// log(-0) = -inf + +// log(+qnan) = +qnan +// log(-qnan) = -qnan +// log(+snan) = +qnan +// log(-snan) = -qnan + +// log(-n) = QNAN Indefinite +// log(-inf) = QNAN Indefinite + +// log(+inf) = +inf + +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input +// f9 -> f15, f32 -> f68 + +// General registers used: +// r32 -> r51 + +// Predicate registers used: +// p6 -> p15 + +// p8 log base e +// p6 log base e special +// p9 used in the frcpa +// p13 log base e large W +// p14 log base e small w + +// p7 log base 10 +// p10 log base 10 large W +// p11 log base 10 small w +// p12 log base 10 special + +#include "libm_support.h" + +// Assembly macros +//============================================================== + +log_int_Nfloat = f9 +log_Nfloat = f10 + +log_P5 = f11 +log_P4 = f12 +log_P3 = f13 +log_P2 = f14 +log_half = f15 + +log_log2 = f32 +log_T = f33 + +log_rp_p4 = f34 +log_rp_p32 = f35 +log_rp_p2 = f36 +log_w6 = f37 +log_rp_p10 = f38 +log_rcube = f39 +log_rsq = f40 + +log_T_plus_Nlog2 = f41 +log_w3 = f42 + +log_r = f43 +log_C = f44 + +log_w = f45 
+log_Q8 = f46 +log_Q7 = f47 +log_Q4 = f48 +log_Q3 = f49 +log_Q6 = f50 +log_Q5 = f51 +log_Q2 = f52 +log_Q1 = f53 +log_P1 = f53 + +log_rp_q7 = f54 +log_rp_q65 = f55 +log_Qlo = f56 + +log_rp_q3 = f57 +log_rp_q21 = f58 +log_Qhi = f59 + +log_wsq = f60 +log_w4 = f61 +log_Q = f62 + +log_inv_ln10 = f63 +log_log10_hi = f64 +log_log10_lo = f65 +log_rp_q10 = f66 +log_NORM_f8 = f67 +log_r2P_r = f68 + +// =================================== + +log_GR_exp_17_ones = r33 +log_GR_exp_16_ones = r34 +log_GR_exp_f8 = r35 +log_GR_signexp_f8 = r36 +log_GR_true_exp_f8 = r37 +log_GR_significand_f8 = r38 +log_GR_half_exp = r39 +log_GR_index = r39 +log_AD_1 = r40 +log_GR_signexp_w = r41 +log_GR_fff9 = r42 +log_AD_2 = r43 +log_GR_exp_w = r44 + +GR_SAVE_B0 = r45 +GR_SAVE_GP = r46 +GR_SAVE_PFS = r47 + +GR_Parameter_X = r48 +GR_Parameter_Y = r49 +GR_Parameter_RESULT = r50 +log_GR_tag = r51 + + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +log_table_1: +ASM_TYPE_DIRECTIVE(log_table_1,@object) +data8 0xBFC5555DA7212371 // P5 +data8 0x3FC999A19EEF5826 // P4 +data8 0x3FBC756AC654273B // Q8 +data8 0xBFC001A42489AB4D // Q7 +data8 0x3FC99999999A169B // Q4 +data8 0xBFD00000000019AC // Q3 +ASM_SIZE_DIRECTIVE(log_table_1) +log_table_2: +ASM_TYPE_DIRECTIVE(log_table_2,@object) +data8 0xBFCFFFFFFFFEF009 // P3 +data8 0x3FD555555554ECB2 // P2 +data8 0x3FC2492479AA0DF8 // Q6 +data8 0xBFC5555544986F52 // Q5 +data8 0x3FD5555555555555 // Q2 +data8 0xBFE0000000000000 // Q1, P1 = -0.5 + + +data8 0xde5bd8a937287195, 0x00003ffd // double-extended 1/ln(10) +data8 0xb17217f7d1cf79ac, 0x00003ffe // log2 +// b17217f7d1cf79ab c9e3b39803f2f6a + + +data8 0x80200aaeac44ef38 , 0x00003ff6 // log(1/frcpa(1+ 0/2^-8)) + +data8 0xc09090a2c35aa070 , 0x00003ff7 // log(1/frcpa(1+ 1/2^-8)) +data8 0xa0c94fcb41977c75 , 0x00003ff8 // log(1/frcpa(1+ 2/2^-8)) +data8 0xe18b9c263af83301 , 0x00003ff8 // log(1/frcpa(1+ 3/2^-8)) +data8 
0x8d35c8d6399c30ea , 0x00003ff9 // log(1/frcpa(1+ 4/2^-8)) +data8 0xadd4d2ecd601cbb8 , 0x00003ff9 // log(1/frcpa(1+ 5/2^-8)) + +data8 0xce95403a192f9f01 , 0x00003ff9 // log(1/frcpa(1+ 6/2^-8)) +data8 0xeb59392cbcc01096 , 0x00003ff9 // log(1/frcpa(1+ 7/2^-8)) +data8 0x862c7d0cefd54c5d , 0x00003ffa // log(1/frcpa(1+ 8/2^-8)) +data8 0x94aa63c65e70d499 , 0x00003ffa // log(1/frcpa(1+ 9/2^-8)) +data8 0xa54a696d4b62b382 , 0x00003ffa // log(1/frcpa(1+ 10/2^-8)) + +data8 0xb3e4a796a5dac208 , 0x00003ffa // log(1/frcpa(1+ 11/2^-8)) +data8 0xc28c45b1878340a9 , 0x00003ffa // log(1/frcpa(1+ 12/2^-8)) +data8 0xd35c55f39d7a6235 , 0x00003ffa // log(1/frcpa(1+ 13/2^-8)) +data8 0xe220f037b954f1f5 , 0x00003ffa // log(1/frcpa(1+ 14/2^-8)) +data8 0xf0f3389b036834f3 , 0x00003ffa // log(1/frcpa(1+ 15/2^-8)) + +data8 0xffd3488d5c980465 , 0x00003ffa // log(1/frcpa(1+ 16/2^-8)) +data8 0x87609ce2ed300490 , 0x00003ffb // log(1/frcpa(1+ 17/2^-8)) +data8 0x8ede9321e8c85927 , 0x00003ffb // log(1/frcpa(1+ 18/2^-8)) +data8 0x96639427f2f8e2f4 , 0x00003ffb // log(1/frcpa(1+ 19/2^-8)) +data8 0x9defad3e8f73217b , 0x00003ffb // log(1/frcpa(1+ 20/2^-8)) + +data8 0xa582ebd50097029c , 0x00003ffb // log(1/frcpa(1+ 21/2^-8)) +data8 0xac06dbe75ab80fee , 0x00003ffb // log(1/frcpa(1+ 22/2^-8)) +data8 0xb3a78449b2d3ccca , 0x00003ffb // log(1/frcpa(1+ 23/2^-8)) +data8 0xbb4f79635ab46bb2 , 0x00003ffb // log(1/frcpa(1+ 24/2^-8)) +data8 0xc2fec93a83523f3f , 0x00003ffb // log(1/frcpa(1+ 25/2^-8)) + +data8 0xc99af2eaca4c4571 , 0x00003ffb // log(1/frcpa(1+ 26/2^-8)) +data8 0xd1581106472fa653 , 0x00003ffb // log(1/frcpa(1+ 27/2^-8)) +data8 0xd8002560d4355f2e , 0x00003ffb // log(1/frcpa(1+ 28/2^-8)) +data8 0xdfcb43b4fe508632 , 0x00003ffb // log(1/frcpa(1+ 29/2^-8)) +data8 0xe67f6dff709d4119 , 0x00003ffb // log(1/frcpa(1+ 30/2^-8)) + +data8 0xed393b1c22351280 , 0x00003ffb // log(1/frcpa(1+ 31/2^-8)) +data8 0xf5192bff087bcc35 , 0x00003ffb // log(1/frcpa(1+ 32/2^-8)) +data8 0xfbdf4ff6dfef2fa3 , 0x00003ffb // log(1/frcpa(1+ 
33/2^-8)) +data8 0x81559a97f92f9cc7 , 0x00003ffc // log(1/frcpa(1+ 34/2^-8)) +data8 0x84be72bce90266e8 , 0x00003ffc // log(1/frcpa(1+ 35/2^-8)) + +data8 0x88bc74113f23def2 , 0x00003ffc // log(1/frcpa(1+ 36/2^-8)) +data8 0x8c2ba3edf6799d11 , 0x00003ffc // log(1/frcpa(1+ 37/2^-8)) +data8 0x8f9dc92f92ea08b1 , 0x00003ffc // log(1/frcpa(1+ 38/2^-8)) +data8 0x9312e8f36efab5a7 , 0x00003ffc // log(1/frcpa(1+ 39/2^-8)) +data8 0x968b08643409ceb6 , 0x00003ffc // log(1/frcpa(1+ 40/2^-8)) + +data8 0x9a062cba08a1708c , 0x00003ffc // log(1/frcpa(1+ 41/2^-8)) +data8 0x9d845b3abf95485c , 0x00003ffc // log(1/frcpa(1+ 42/2^-8)) +data8 0xa06fd841bc001bb4 , 0x00003ffc // log(1/frcpa(1+ 43/2^-8)) +data8 0xa3f3a74652fbe0db , 0x00003ffc // log(1/frcpa(1+ 44/2^-8)) +data8 0xa77a8fb2336f20f5 , 0x00003ffc // log(1/frcpa(1+ 45/2^-8)) + +data8 0xab0497015d28b0a0 , 0x00003ffc // log(1/frcpa(1+ 46/2^-8)) +data8 0xae91c2be6ba6a615 , 0x00003ffc // log(1/frcpa(1+ 47/2^-8)) +data8 0xb189d1b99aebb20b , 0x00003ffc // log(1/frcpa(1+ 48/2^-8)) +data8 0xb51cced5de9c1b2c , 0x00003ffc // log(1/frcpa(1+ 49/2^-8)) +data8 0xb819bee9e720d42f , 0x00003ffc // log(1/frcpa(1+ 50/2^-8)) + +data8 0xbbb2a0947b093a5d , 0x00003ffc // log(1/frcpa(1+ 51/2^-8)) +data8 0xbf4ec1505811684a , 0x00003ffc // log(1/frcpa(1+ 52/2^-8)) +data8 0xc2535bacfa8975ff , 0x00003ffc // log(1/frcpa(1+ 53/2^-8)) +data8 0xc55a3eafad187eb8 , 0x00003ffc // log(1/frcpa(1+ 54/2^-8)) +data8 0xc8ff2484b2c0da74 , 0x00003ffc // log(1/frcpa(1+ 55/2^-8)) + +data8 0xcc0b1a008d53ab76 , 0x00003ffc // log(1/frcpa(1+ 56/2^-8)) +data8 0xcfb6203844b3209b , 0x00003ffc // log(1/frcpa(1+ 57/2^-8)) +data8 0xd2c73949a47a19f5 , 0x00003ffc // log(1/frcpa(1+ 58/2^-8)) +data8 0xd5daae18b49d6695 , 0x00003ffc // log(1/frcpa(1+ 59/2^-8)) +data8 0xd8f08248cf7e8019 , 0x00003ffc // log(1/frcpa(1+ 60/2^-8)) + +data8 0xdca7749f1b3e540e , 0x00003ffc // log(1/frcpa(1+ 61/2^-8)) +data8 0xdfc28e033aaaf7c7 , 0x00003ffc // log(1/frcpa(1+ 62/2^-8)) +data8 0xe2e012a5f91d2f55 , 
0x00003ffc // log(1/frcpa(1+ 63/2^-8)) +data8 0xe600064ed9e292a8 , 0x00003ffc // log(1/frcpa(1+ 64/2^-8)) +data8 0xe9226cce42b39f60 , 0x00003ffc // log(1/frcpa(1+ 65/2^-8)) + +data8 0xec4749fd97a28360 , 0x00003ffc // log(1/frcpa(1+ 66/2^-8)) +data8 0xef6ea1bf57780495 , 0x00003ffc // log(1/frcpa(1+ 67/2^-8)) +data8 0xf29877ff38809091 , 0x00003ffc // log(1/frcpa(1+ 68/2^-8)) +data8 0xf5c4d0b245cb89be , 0x00003ffc // log(1/frcpa(1+ 69/2^-8)) +data8 0xf8f3afd6fcdef3aa , 0x00003ffc // log(1/frcpa(1+ 70/2^-8)) + +data8 0xfc2519756be1abc7 , 0x00003ffc // log(1/frcpa(1+ 71/2^-8)) +data8 0xff59119f503e6832 , 0x00003ffc // log(1/frcpa(1+ 72/2^-8)) +data8 0x8147ce381ae0e146 , 0x00003ffd // log(1/frcpa(1+ 73/2^-8)) +data8 0x82e45f06cb1ad0f2 , 0x00003ffd // log(1/frcpa(1+ 74/2^-8)) +data8 0x842f5c7c573cbaa2 , 0x00003ffd // log(1/frcpa(1+ 75/2^-8)) + +data8 0x85ce471968c8893a , 0x00003ffd // log(1/frcpa(1+ 76/2^-8)) +data8 0x876e8305bc04066d , 0x00003ffd // log(1/frcpa(1+ 77/2^-8)) +data8 0x891012678031fbb3 , 0x00003ffd // log(1/frcpa(1+ 78/2^-8)) +data8 0x8a5f1493d766a05f , 0x00003ffd // log(1/frcpa(1+ 79/2^-8)) +data8 0x8c030c778c56fa00 , 0x00003ffd // log(1/frcpa(1+ 80/2^-8)) + +data8 0x8da85df17e31d9ae , 0x00003ffd // log(1/frcpa(1+ 81/2^-8)) +data8 0x8efa663e7921687e , 0x00003ffd // log(1/frcpa(1+ 82/2^-8)) +data8 0x90a22b6875c6a1f8 , 0x00003ffd // log(1/frcpa(1+ 83/2^-8)) +data8 0x91f62cc8f5d24837 , 0x00003ffd // log(1/frcpa(1+ 84/2^-8)) +data8 0x93a06cfc3857d980 , 0x00003ffd // log(1/frcpa(1+ 85/2^-8)) + +data8 0x94f66d5e6fd01ced , 0x00003ffd // log(1/frcpa(1+ 86/2^-8)) +data8 0x96a330156e6772f2 , 0x00003ffd // log(1/frcpa(1+ 87/2^-8)) +data8 0x97fb3582754ea25b , 0x00003ffd // log(1/frcpa(1+ 88/2^-8)) +data8 0x99aa8259aad1bbf2 , 0x00003ffd // log(1/frcpa(1+ 89/2^-8)) +data8 0x9b0492f6227ae4a8 , 0x00003ffd // log(1/frcpa(1+ 90/2^-8)) + +data8 0x9c5f8e199bf3a7a5 , 0x00003ffd // log(1/frcpa(1+ 91/2^-8)) +data8 0x9e1293b9998c1daa , 0x00003ffd // log(1/frcpa(1+ 92/2^-8)) 
+data8 0x9f6fa31e0b41f308 , 0x00003ffd // log(1/frcpa(1+ 93/2^-8)) +data8 0xa0cda11eaf46390e , 0x00003ffd // log(1/frcpa(1+ 94/2^-8)) +data8 0xa22c8f029cfa45aa , 0x00003ffd // log(1/frcpa(1+ 95/2^-8)) + +data8 0xa3e48badb7856b34 , 0x00003ffd // log(1/frcpa(1+ 96/2^-8)) +data8 0xa5459a0aa95849f9 , 0x00003ffd // log(1/frcpa(1+ 97/2^-8)) +data8 0xa6a79c84480cfebd , 0x00003ffd // log(1/frcpa(1+ 98/2^-8)) +data8 0xa80a946d0fcb3eb2 , 0x00003ffd // log(1/frcpa(1+ 99/2^-8)) +data8 0xa96e831a3ea7b314 , 0x00003ffd // log(1/frcpa(1+100/2^-8)) + +data8 0xaad369e3dc544e3b , 0x00003ffd // log(1/frcpa(1+101/2^-8)) +data8 0xac92e9588952c815 , 0x00003ffd // log(1/frcpa(1+102/2^-8)) +data8 0xadfa035aa1ed8fdc , 0x00003ffd // log(1/frcpa(1+103/2^-8)) +data8 0xaf6219eae1ad6e34 , 0x00003ffd // log(1/frcpa(1+104/2^-8)) +data8 0xb0cb2e6d8160f753 , 0x00003ffd // log(1/frcpa(1+105/2^-8)) + +data8 0xb2354249ad950f72 , 0x00003ffd // log(1/frcpa(1+106/2^-8)) +data8 0xb3a056e98ef4a3b4 , 0x00003ffd // log(1/frcpa(1+107/2^-8)) +data8 0xb50c6dba52c6292a , 0x00003ffd // log(1/frcpa(1+108/2^-8)) +data8 0xb679882c33876165 , 0x00003ffd // log(1/frcpa(1+109/2^-8)) +data8 0xb78c07429785cedc , 0x00003ffd // log(1/frcpa(1+110/2^-8)) + +data8 0xb8faeb8dc4a77d24 , 0x00003ffd // log(1/frcpa(1+111/2^-8)) +data8 0xba6ad77eb36ae0d6 , 0x00003ffd // log(1/frcpa(1+112/2^-8)) +data8 0xbbdbcc915e9bee50 , 0x00003ffd // log(1/frcpa(1+113/2^-8)) +data8 0xbd4dcc44f8cf12ef , 0x00003ffd // log(1/frcpa(1+114/2^-8)) +data8 0xbec0d81bf5b531fa , 0x00003ffd // log(1/frcpa(1+115/2^-8)) + +data8 0xc034f19c139186f4 , 0x00003ffd // log(1/frcpa(1+116/2^-8)) +data8 0xc14cb69f7c5e55ab , 0x00003ffd // log(1/frcpa(1+117/2^-8)) +data8 0xc2c2abbb6e5fd56f , 0x00003ffd // log(1/frcpa(1+118/2^-8)) +data8 0xc439b2c193e6771e , 0x00003ffd // log(1/frcpa(1+119/2^-8)) +data8 0xc553acb9d5c67733 , 0x00003ffd // log(1/frcpa(1+120/2^-8)) + +data8 0xc6cc96e441272441 , 0x00003ffd // log(1/frcpa(1+121/2^-8)) +data8 0xc8469753eca88c30 , 0x00003ffd // 
log(1/frcpa(1+122/2^-8)) +data8 0xc962cf3ce072b05c , 0x00003ffd // log(1/frcpa(1+123/2^-8)) +data8 0xcadeba8771f694aa , 0x00003ffd // log(1/frcpa(1+124/2^-8)) +data8 0xcc5bc08d1f72da94 , 0x00003ffd // log(1/frcpa(1+125/2^-8)) + +data8 0xcd7a3f99ea035c29 , 0x00003ffd // log(1/frcpa(1+126/2^-8)) +data8 0xcef93860c8a53c35 , 0x00003ffd // log(1/frcpa(1+127/2^-8)) +data8 0xd0192f68a7ed23df , 0x00003ffd // log(1/frcpa(1+128/2^-8)) +data8 0xd19a201127d3c645 , 0x00003ffd // log(1/frcpa(1+129/2^-8)) +data8 0xd2bb92f4061c172c , 0x00003ffd // log(1/frcpa(1+130/2^-8)) + +data8 0xd43e80b2ee8cc8fc , 0x00003ffd // log(1/frcpa(1+131/2^-8)) +data8 0xd56173601fc4ade4 , 0x00003ffd // log(1/frcpa(1+132/2^-8)) +data8 0xd6e6637efb54086f , 0x00003ffd // log(1/frcpa(1+133/2^-8)) +data8 0xd80ad9f58f3c8193 , 0x00003ffd // log(1/frcpa(1+134/2^-8)) +data8 0xd991d1d31aca41f8 , 0x00003ffd // log(1/frcpa(1+135/2^-8)) + +data8 0xdab7d02231484a93 , 0x00003ffd // log(1/frcpa(1+136/2^-8)) +data8 0xdc40d532cde49a54 , 0x00003ffd // log(1/frcpa(1+137/2^-8)) +data8 0xdd685f79ed8b265e , 0x00003ffd // log(1/frcpa(1+138/2^-8)) +data8 0xde9094bbc0e17b1d , 0x00003ffd // log(1/frcpa(1+139/2^-8)) +data8 0xe01c91b78440c425 , 0x00003ffd // log(1/frcpa(1+140/2^-8)) + +data8 0xe14658f26997e729 , 0x00003ffd // log(1/frcpa(1+141/2^-8)) +data8 0xe270cdc2391e0d23 , 0x00003ffd // log(1/frcpa(1+142/2^-8)) +data8 0xe3ffce3a2aa64922 , 0x00003ffd // log(1/frcpa(1+143/2^-8)) +data8 0xe52bdb274ed82887 , 0x00003ffd // log(1/frcpa(1+144/2^-8)) +data8 0xe6589852e75d7df6 , 0x00003ffd // log(1/frcpa(1+145/2^-8)) + +data8 0xe786068c79937a7d , 0x00003ffd // log(1/frcpa(1+146/2^-8)) +data8 0xe91903adad100911 , 0x00003ffd // log(1/frcpa(1+147/2^-8)) +data8 0xea481236f7d35bb0 , 0x00003ffd // log(1/frcpa(1+148/2^-8)) +data8 0xeb77d48c692e6b14 , 0x00003ffd // log(1/frcpa(1+149/2^-8)) +data8 0xeca84b83d7297b87 , 0x00003ffd // log(1/frcpa(1+150/2^-8)) + +data8 0xedd977f4962aa158 , 0x00003ffd // log(1/frcpa(1+151/2^-8)) +data8 
0xef7179a22f257754 , 0x00003ffd // log(1/frcpa(1+152/2^-8)) +data8 0xf0a450d139366ca7 , 0x00003ffd // log(1/frcpa(1+153/2^-8)) +data8 0xf1d7e0524ff9ffdb , 0x00003ffd // log(1/frcpa(1+154/2^-8)) +data8 0xf30c29036a8b6cae , 0x00003ffd // log(1/frcpa(1+155/2^-8)) + +data8 0xf4412bc411ea8d92 , 0x00003ffd // log(1/frcpa(1+156/2^-8)) +data8 0xf576e97564c8619d , 0x00003ffd // log(1/frcpa(1+157/2^-8)) +data8 0xf6ad62fa1b5f172f , 0x00003ffd // log(1/frcpa(1+158/2^-8)) +data8 0xf7e499368b55c542 , 0x00003ffd // log(1/frcpa(1+159/2^-8)) +data8 0xf91c8d10abaffe22 , 0x00003ffd // log(1/frcpa(1+160/2^-8)) + +data8 0xfa553f7018c966f3 , 0x00003ffd // log(1/frcpa(1+161/2^-8)) +data8 0xfb8eb13e185d802c , 0x00003ffd // log(1/frcpa(1+162/2^-8)) +data8 0xfcc8e3659d9bcbed , 0x00003ffd // log(1/frcpa(1+163/2^-8)) +data8 0xfe03d6d34d487fd2 , 0x00003ffd // log(1/frcpa(1+164/2^-8)) +data8 0xff3f8c7581e9f0ae , 0x00003ffd // log(1/frcpa(1+165/2^-8)) + +data8 0x803e029e280173ae , 0x00003ffe // log(1/frcpa(1+166/2^-8)) +data8 0x80dca10cc52d0757 , 0x00003ffe // log(1/frcpa(1+167/2^-8)) +data8 0x817ba200632755a1 , 0x00003ffe // log(1/frcpa(1+168/2^-8)) +data8 0x821b05f3b01d6774 , 0x00003ffe // log(1/frcpa(1+169/2^-8)) +data8 0x82bacd623ff19d06 , 0x00003ffe // log(1/frcpa(1+170/2^-8)) + +data8 0x835af8c88e7a8f47 , 0x00003ffe // log(1/frcpa(1+171/2^-8)) +data8 0x83c5f8299e2b4091 , 0x00003ffe // log(1/frcpa(1+172/2^-8)) +data8 0x8466cb43f3d87300 , 0x00003ffe // log(1/frcpa(1+173/2^-8)) +data8 0x850803a67c80ca4b , 0x00003ffe // log(1/frcpa(1+174/2^-8)) +data8 0x85a9a1d11a23b461 , 0x00003ffe // log(1/frcpa(1+175/2^-8)) + +data8 0x864ba644a18e6e05 , 0x00003ffe // log(1/frcpa(1+176/2^-8)) +data8 0x86ee1182dcc432f7 , 0x00003ffe // log(1/frcpa(1+177/2^-8)) +data8 0x875a925d7e48c316 , 0x00003ffe // log(1/frcpa(1+178/2^-8)) +data8 0x87fdaa109d23aef7 , 0x00003ffe // log(1/frcpa(1+179/2^-8)) +data8 0x88a129ed4becfaf2 , 0x00003ffe // log(1/frcpa(1+180/2^-8)) + +data8 0x89451278ecd7f9cf , 0x00003ffe // 
log(1/frcpa(1+181/2^-8)) +data8 0x89b29295f8432617 , 0x00003ffe // log(1/frcpa(1+182/2^-8)) +data8 0x8a572ac5a5496882 , 0x00003ffe // log(1/frcpa(1+183/2^-8)) +data8 0x8afc2d0ce3b2dadf , 0x00003ffe // log(1/frcpa(1+184/2^-8)) +data8 0x8b6a69c608cfd3af , 0x00003ffe // log(1/frcpa(1+185/2^-8)) + +data8 0x8c101e106e899a83 , 0x00003ffe // log(1/frcpa(1+186/2^-8)) +data8 0x8cb63de258f9d626 , 0x00003ffe // log(1/frcpa(1+187/2^-8)) +data8 0x8d2539c5bd19e2b1 , 0x00003ffe // log(1/frcpa(1+188/2^-8)) +data8 0x8dcc0e064b29e6f1 , 0x00003ffe // log(1/frcpa(1+189/2^-8)) +data8 0x8e734f45d88357ae , 0x00003ffe // log(1/frcpa(1+190/2^-8)) + +data8 0x8ee30cef034a20db , 0x00003ffe // log(1/frcpa(1+191/2^-8)) +data8 0x8f8b0515686d1d06 , 0x00003ffe // log(1/frcpa(1+192/2^-8)) +data8 0x90336bba039bf32f , 0x00003ffe // log(1/frcpa(1+193/2^-8)) +data8 0x90a3edd23d1c9d58 , 0x00003ffe // log(1/frcpa(1+194/2^-8)) +data8 0x914d0de2f5d61b32 , 0x00003ffe // log(1/frcpa(1+195/2^-8)) + +data8 0x91be0c20d28173b5 , 0x00003ffe // log(1/frcpa(1+196/2^-8)) +data8 0x9267e737c06cd34a , 0x00003ffe // log(1/frcpa(1+197/2^-8)) +data8 0x92d962ae6abb1237 , 0x00003ffe // log(1/frcpa(1+198/2^-8)) +data8 0x9383fa6afbe2074c , 0x00003ffe // log(1/frcpa(1+199/2^-8)) +data8 0x942f0421651c1c4e , 0x00003ffe // log(1/frcpa(1+200/2^-8)) + +data8 0x94a14a3845bb985e , 0x00003ffe // log(1/frcpa(1+201/2^-8)) +data8 0x954d133857f861e7 , 0x00003ffe // log(1/frcpa(1+202/2^-8)) +data8 0x95bfd96468e604c4 , 0x00003ffe // log(1/frcpa(1+203/2^-8)) +data8 0x9632d31cafafa858 , 0x00003ffe // log(1/frcpa(1+204/2^-8)) +data8 0x96dfaabd86fa1647 , 0x00003ffe // log(1/frcpa(1+205/2^-8)) + +data8 0x9753261fcbb2a594 , 0x00003ffe // log(1/frcpa(1+206/2^-8)) +data8 0x9800c11b426b996d , 0x00003ffe // log(1/frcpa(1+207/2^-8)) +data8 0x9874bf4d45ae663c , 0x00003ffe // log(1/frcpa(1+208/2^-8)) +data8 0x99231f5ee9a74f79 , 0x00003ffe // log(1/frcpa(1+209/2^-8)) +data8 0x9997a18a56bcad28 , 0x00003ffe // log(1/frcpa(1+210/2^-8)) + +data8 
0x9a46c873a3267e79 , 0x00003ffe // log(1/frcpa(1+211/2^-8)) +data8 0x9abbcfc621eb6cb6 , 0x00003ffe // log(1/frcpa(1+212/2^-8)) +data8 0x9b310cb0d354c990 , 0x00003ffe // log(1/frcpa(1+213/2^-8)) +data8 0x9be14cf9e1b3515c , 0x00003ffe // log(1/frcpa(1+214/2^-8)) +data8 0x9c5710b8cbb73a43 , 0x00003ffe // log(1/frcpa(1+215/2^-8)) + +data8 0x9ccd0abd301f399c , 0x00003ffe // log(1/frcpa(1+216/2^-8)) +data8 0x9d7e67f3bdce8888 , 0x00003ffe // log(1/frcpa(1+217/2^-8)) +data8 0x9df4ea81a99daa01 , 0x00003ffe // log(1/frcpa(1+218/2^-8)) +data8 0x9e6ba405a54514ba , 0x00003ffe // log(1/frcpa(1+219/2^-8)) +data8 0x9f1e21c8c7bb62b3 , 0x00003ffe // log(1/frcpa(1+220/2^-8)) + +data8 0x9f956593f6b6355c , 0x00003ffe // log(1/frcpa(1+221/2^-8)) +data8 0xa00ce1092e5498c3 , 0x00003ffe // log(1/frcpa(1+222/2^-8)) +data8 0xa0c08309c4b912c1 , 0x00003ffe // log(1/frcpa(1+223/2^-8)) +data8 0xa1388a8c6faa2afa , 0x00003ffe // log(1/frcpa(1+224/2^-8)) +data8 0xa1b0ca7095b5f985 , 0x00003ffe // log(1/frcpa(1+225/2^-8)) + +data8 0xa22942eb47534a00 , 0x00003ffe // log(1/frcpa(1+226/2^-8)) +data8 0xa2de62326449d0a3 , 0x00003ffe // log(1/frcpa(1+227/2^-8)) +data8 0xa357690f88bfe345 , 0x00003ffe // log(1/frcpa(1+228/2^-8)) +data8 0xa3d0a93f45169a4b , 0x00003ffe // log(1/frcpa(1+229/2^-8)) +data8 0xa44a22f7ffe65f30 , 0x00003ffe // log(1/frcpa(1+230/2^-8)) + +data8 0xa500c5e5b4c1aa36 , 0x00003ffe // log(1/frcpa(1+231/2^-8)) +data8 0xa57ad064eb2ebbc2 , 0x00003ffe // log(1/frcpa(1+232/2^-8)) +data8 0xa5f5152dedf4384e , 0x00003ffe // log(1/frcpa(1+233/2^-8)) +data8 0xa66f9478856233ec , 0x00003ffe // log(1/frcpa(1+234/2^-8)) +data8 0xa6ea4e7cca02c32e , 0x00003ffe // log(1/frcpa(1+235/2^-8)) + +data8 0xa765437325341ccf , 0x00003ffe // log(1/frcpa(1+236/2^-8)) +data8 0xa81e21e6c75b4020 , 0x00003ffe // log(1/frcpa(1+237/2^-8)) +data8 0xa899ab333fe2b9ca , 0x00003ffe // log(1/frcpa(1+238/2^-8)) +data8 0xa9157039c51ebe71 , 0x00003ffe // log(1/frcpa(1+239/2^-8)) +data8 0xa991713433c2b999 , 0x00003ffe // 
log(1/frcpa(1+240/2^-8)) + +data8 0xaa0dae5cbcc048b3 , 0x00003ffe // log(1/frcpa(1+241/2^-8)) +data8 0xaa8a27ede5eb13ad , 0x00003ffe // log(1/frcpa(1+242/2^-8)) +data8 0xab06de228a9e3499 , 0x00003ffe // log(1/frcpa(1+243/2^-8)) +data8 0xab83d135dc633301 , 0x00003ffe // log(1/frcpa(1+244/2^-8)) +data8 0xac3fb076adc7fe7a , 0x00003ffe // log(1/frcpa(1+245/2^-8)) + +data8 0xacbd3cbbe47988f1 , 0x00003ffe // log(1/frcpa(1+246/2^-8)) +data8 0xad3b06b1a5dc57c3 , 0x00003ffe // log(1/frcpa(1+247/2^-8)) +data8 0xadb90e94af887717 , 0x00003ffe // log(1/frcpa(1+248/2^-8)) +data8 0xae3754a218f7c816 , 0x00003ffe // log(1/frcpa(1+249/2^-8)) +data8 0xaeb5d9175437afa2 , 0x00003ffe // log(1/frcpa(1+250/2^-8)) + +data8 0xaf349c322e9c7cee , 0x00003ffe // log(1/frcpa(1+251/2^-8)) +data8 0xafb39e30d1768d1c , 0x00003ffe // log(1/frcpa(1+252/2^-8)) +data8 0xb032df51c2c93116 , 0x00003ffe // log(1/frcpa(1+253/2^-8)) +data8 0xb0b25fd3e6035ad9 , 0x00003ffe // log(1/frcpa(1+254/2^-8)) +data8 0xb1321ff67cba178c , 0x00003ffe // log(1/frcpa(1+255/2^-8)) +ASM_SIZE_DIRECTIVE(log_table_2) + + +.align 32 +.global log# +.global log10# + +// log10 has p7 true, p8 false +// log has p8 true, p7 false +// Both entry points start frcpa on x (f8) into log_C, normalize x into +// log_NORM_f8 and form the linkage-table address of log_table_1; they +// differ only in which of p7/p8 is set. + +.section .text +.proc log10# +.align 32 + +// log10 entry (also __ieee754_log10 under _LIBC): sets p7, clears p8 and +// branches to the shared code at LOG_LOG10_X inside log. +log10: +#ifdef _LIBC +.global __ieee754_log10 +.type __ieee754_log10,@function +__ieee754_log10: +#endif +{ .mfi + alloc r32=ar.pfs,1,15,4,0 + frcpa.s1 log_C,p9 = f1,f8 + cmp.eq.unc p7,p8 = r0, r0 +} +{ .mfb + addl log_AD_1 = @ltoff(log_table_1), gp + fnorm.s1 log_NORM_f8 = f8 + br.sptk L(LOG_LOG10_X) +} +;; + +.endp log10 +ASM_SIZE_DIRECTIVE(log10) +ASM_SIZE_DIRECTIVE(__ieee754_log10) + + +.section .text +.proc log# +.align 32 +// log entry (also __ieee754_log under _LIBC): sets p8, clears p7 and falls +// through into the shared code at LOG_LOG10_X. +log: +#ifdef _LIBC +.global __ieee754_log +.type __ieee754_log,@function +__ieee754_log: +#endif + +{ .mfi + alloc r32=ar.pfs,1,15,4,0 + frcpa.s1 log_C,p9 = f1,f8 + cmp.eq.unc p8,p7 = r0, r0 +} +{ .mfi + addl log_AD_1 = @ltoff(log_table_1), gp + fnorm.s1 log_NORM_f8 = f8 + nop.i 999 +} +;; + +L(LOG_LOG10_X): + +{ .mfi + ld8 log_AD_1 = 
[log_AD_1] + fclass.m.unc p15,p0 = f8, 0x0b // Test for x=unorm + mov log_GR_fff9 = 0xfff9 +} +{ .mfi + mov log_GR_half_exp = 0x0fffe + fms.s1 log_w = f8,f1,f1 + mov log_GR_exp_17_ones = 0x1ffff +} +;; + +{ .mmi + getf.exp log_GR_signexp_f8 = f8 // If x unorm then must recompute + setf.exp log_half = log_GR_half_exp // Form 0.5 = -Q1 + nop.i 999 +} +;; + +{ .mmb + adds log_AD_2 = 0x30, log_AD_1 + mov log_GR_exp_16_ones = 0xffff +(p15) br.cond.spnt L(LOG_DENORM) +} +;; + +L(LOG_COMMON): +// Shared by normal and unnormal inputs (LOG_DENORM branches back here): +// load the P and Q coefficients through both table pointers while +// classifying x (nan, +inf, 1.0, zero, negative). +{.mfi + ldfpd log_P5,log_P4 = [log_AD_1],16 + fclass.m.unc p6,p0 = f8, 0xc3 // Test for x=nan + and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones +} +{.mfi + ldfpd log_P3,log_P2 = [log_AD_2],16 + nop.f 999 + nop.i 999 +} +;; + +{ .mfi + ldfpd log_Q8,log_Q7 = [log_AD_1],16 + fclass.m.unc p11,p0 = f8, 0x21 // Test for x=+inf + sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones +} +{ .mfi + ldfpd log_Q6,log_Q5 = [log_AD_2],16 + nop.f 999 + nop.i 999 +} +;; + + +{ .mfi + ldfpd log_Q4,log_Q3 = [log_AD_1],16 + fma.s1 log_wsq = log_w, log_w, f0 + nop.i 999 +} +{ .mfb + ldfpd log_Q2,log_Q1 = [log_AD_2],16 +(p6) fma.d.s0 f8 = f8,f1,f0 // quietize nan result if x=nan +(p6) br.ret.spnt b0 // Exit for x=nan +} +;; + + +{ .mfi + setf.sig log_int_Nfloat = log_GR_true_exp_f8 + fcmp.eq.s1 p10,p0 = log_NORM_f8, f1 // Test for x=+1.0 + nop.i 999 +} +{ .mfb + nop.m 999 + fms.s1 log_r = log_C,f8,f1 +(p11) br.ret.spnt b0 // Exit for x=+inf +} +;; + + +{ .mmf + getf.sig log_GR_significand_f8 = log_NORM_f8 + ldfe log_inv_ln10 = [log_AD_2],16 + fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 +} +;; + + +{ .mfb + nop.m 999 +(p10) fmerge.s f8 = f0, f0 +(p10) br.ret.spnt b0 // Exit for x=1.0 +;; +} + +// Table index: shl by 1 drops bit 63 of the significand and shr.u by 56 +// keeps the next 8 bits; shladd ..,4 below scales the index by 16 bytes. +{ .mfi + getf.exp log_GR_signexp_w = log_w + fclass.m.unc p12,p0 = f8, 0x3a // Test for x neg norm, unorm, inf + shl log_GR_index = log_GR_significand_f8,1 +} +;; + +{ .mfi + ldfe log_log2 = [log_AD_2],16 + fnma.s1 log_rp_q10 = log_half, log_wsq, log_w + shr.u log_GR_index = log_GR_index,56 +} +{ 
.mfb + nop.m 999 + fma.s1 log_w3 = log_wsq, log_w, f0 +(p6) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x=0 +;; +} + + +{ .mfi + and log_GR_exp_w = log_GR_exp_17_ones, log_GR_signexp_w + fma.s1 log_w4 = log_wsq, log_wsq, f0 + nop.i 999 +} +{ .mfb + shladd log_AD_2 = log_GR_index,4,log_AD_2 + fma.s1 log_rsq = log_r, log_r, f0 +(p12) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x<0 +;; +} + +{ .mfi + ldfe log_T = [log_AD_2] + fma.s1 log_rp_p4 = log_P5, log_r, log_P4 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 log_rp_p32 = log_P3, log_r, log_P2 + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 log_rp_q7 = log_Q8, log_w, log_Q7 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 log_rp_q65 = log_Q6, log_w, log_Q5 + nop.i 999 +;; +} + +// w = x - 1; "large w" means the exponent field of w is >= 0xfff9, i.e. +// |w| >= 2^-6 given the 0xffff exponent bias subtracted above. +// p13 <== large w log +// p14 <== small w log +{ .mfi +(p8) cmp.ge.unc p13,p14 = log_GR_exp_w, log_GR_fff9 + fma.s1 log_rp_q3 = log_Q4, log_w, log_Q3 + nop.i 999 +;; +} + +// p10 <== large w log10 +// p11 <== small w log10 +{ .mfi +(p7) cmp.ge.unc p10,p11 = log_GR_exp_w, log_GR_fff9 + fcvt.xf log_Nfloat = log_int_Nfloat + nop.i 999 +} + +{ .mfi + nop.m 999 + fma.s1 log_rp_q21 = log_Q2, log_w3, log_rp_q10 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 log_rcube = log_rsq, log_r, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 log_rp_p10 = log_rsq, log_P1, log_r + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fcmp.eq.s0 p6,p0 = f8,f0 // Sets flag on +denormal input + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 log_rp_p2 = log_rp_p4, log_rsq, log_rp_p32 + nop.i 999 +;; +} + + +{ .mfi + nop.m 999 + fma.s1 log_w6 = log_w3, log_w3, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 log_Qlo = log_rp_q7, log_wsq, log_rp_q65 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 log_Qhi = log_rp_q3, log_w4, log_rp_q21 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 + fma.s1 log_T_plus_Nlog2 = log_Nfloat,log_log2, log_T + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 log_r2P_r = log_rp_p2, log_rcube, log_rp_p10 + nop.i 999 ;; +} + + +// small w, log <== p14 +{ .mfi + nop.m 999 
+(p14) fma.d f8 = log_Qlo, log_w6, log_Qhi + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 log_Q = log_Qlo, log_w6, log_Qhi + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p10) fma.s1 log_log10_hi = log_T_plus_Nlog2, log_inv_ln10,f0 + nop.i 999 ;; +} + +// large w, log <== p13 +.pred.rel "mutex",p13,p10 +{ .mfi + nop.m 999 +(p13) fadd.d f8 = log_T_plus_Nlog2, log_r2P_r + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fma.s1 log_log10_lo = log_inv_ln10, log_r2P_r,f0 + nop.i 999 ;; +} + + +// small w, log10 <== p11 +{ .mfi + nop.m 999 +(p11) fma.d f8 = log_inv_ln10,log_Q,f0 + nop.i 999 ;; +} + +// large w, log10 <== p10 +{ .mfb + nop.m 999 +(p10) fma.d f8 = log_log10_hi, f1, log_log10_lo + br.ret.sptk b0 +;; +} + +// Unnormal x: re-read the exponent from the normalized copy, then rejoin +// the common path. +L(LOG_DENORM): +{ .mfb + getf.exp log_GR_signexp_f8 = log_NORM_f8 + nop.f 999 + br.cond.sptk L(LOG_COMMON) +} +;; + +L(LOG_ZERO_NEG): +// x = +-0 or x negative: save x in f10, select an error tag (2/8 for zero, +// 3/9 for negative; ln/log10 respectively), form the result in f8 with +// frcpa (-1/0 or 0/0) and branch to __libm_error_region. + +// qnan snan inf norm unorm 0 -+ +// 0 0 0 0 0 1 11 0x7 +// 0 0 1 1 1 0 10 0x3a + +// Save x (f8) in f10 +{ .mfi + nop.m 999 + fmerge.s f10 = f8,f8 + nop.i 999 ;; +} + +// p8 p9 means ln(+-0) = -inf +// p7 p10 means log10(+-0) = -inf + +// p13 means ln(-) +// p14 means log10(-) + + +{ .mfi + nop.m 999 + fmerge.ns f6 = f1,f1 // Form -1.0 + nop.i 999 ;; +} + +// p9 means ln(+-0) = -inf +// p10 means log10(+-0) = -inf +// Log(+-0) = -inf + +{ .mfi + nop.m 999 +(p8) fclass.m.unc p9,p0 = f10, 0x07 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fclass.m.unc p10,p0 = f10, 0x07 + nop.i 999 ;; +} + + +// p13 ln(-) +// p14 log10(-) + +// Log(-inf, -normal, -unnormal) = QNAN indefinite +{ .mfi + nop.m 999 +(p8) fclass.m.unc p13,p0 = f10, 0x3a + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fclass.m.unc p14,p0 = f10, 0x3a + nop.i 999 ;; +} + + +.pred.rel "mutex",p9,p10 +{ .mfi +(p9) mov log_GR_tag = 2 +(p9) frcpa f8,p11 = f6,f0 + nop.i 999 +} +{ .mfi +(p10) mov log_GR_tag = 8 +(p10) frcpa f8,p12 = f6,f0 + nop.i 999 ;; +} + +.pred.rel "mutex",p13,p14 +{ .mfi +(p13) mov log_GR_tag = 3 +(p13) frcpa f8,p11 = f0,f0 + nop.i 999 +} +{ .mfb +(p14) mov log_GR_tag = 9 +(p14) 
frcpa f8,p12 = f0,f0 + br.cond.sptk __libm_error_region ;; +} +.endp log +ASM_SIZE_DIRECTIVE(log) +ASM_SIZE_DIRECTIVE(__ieee754_log) + + +// Stack operations when calling error support. +// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 +// | | | | +// | <-GR_Y Y2->| Y2 ->| <- GR_Y | +// | | | | +// | | <- GR_X X1 ->| | +// | | | | +// sp-64 -> + sp -> + sp -> + + +// save ar.pfs save b0 restore gp +// save gp restore ar.pfs + + + +// __libm_error_region: reached by a plain branch from LOG_ZERO_NEG (not a +// call), so b0 still holds the return address of log/log10's caller. +// It opens a 64-byte frame, stores f1, the saved argument (f10) and the +// provisional result (f8) in it, calls __libm_error_support, reloads the +// result from sp+48 into f8, restores sp/b0/gp/ar.pfs and returns. +.proc __libm_error_region +__libm_error_region: +.prologue + +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfd [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfd [GR_Parameter_X] = f10 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; + +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_log10.c b/sysdeps/ia64/fpu/e_log10.c new file mode 100644 index
0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/e_log10.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/e_log10f.c b/sysdeps/ia64/fpu/e_log10f.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/e_log10f.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/e_log10l.c b/sysdeps/ia64/fpu/e_log10l.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/e_log10l.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/e_logf.S b/sysdeps/ia64/fpu/e_logf.S new file mode 100644 index 0000000000..1799e4c1c2 --- /dev/null +++ b/sysdeps/ia64/fpu/e_logf.S @@ -0,0 +1,946 @@ +.file "logf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// History +//============================================================== +// 3/01/00 Initial version +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 1/10/01 Improved speed, fixed flags for neg denormals +// +// +// API +//============================================================== +// float logf(float) +// float log10f(float) +// +// Overview of operation +//============================================================== +// Background +// +// Consider x = 2^N 1.f1 f2 f3 f4...f63 +// Log(x) = log(frcpa(x) x/frcpa(x)) +// = log(1/frcpa(x)) + log(frcpa(x) x) +// = -log(frcpa(x)) + log(frcpa(x) x) +// +// frcpa(x) = 2^-N frcpa(1.f1 f2 ... f63) +// +// -log(frcpa(x)) = -log(C) +// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63)) +// +// -log(frcpa(x)) = -log(C) +// = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) +// +// -log(frcpa(x)) = -log(C) +// = +Nlog2 + log(1/frcpa(1.f1 f2 ... f63)) +// +// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x) + +// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x) +// Log(x) = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x) +// Log(x) = +Nlog2 + T + log(frcpa(x) x) +// +// Log(x) = +Nlog2 + T + log(C x) +// +// Cx = 1 + r +// +// Log(x) = +Nlog2 + T + log(1+r) +// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....) +// +// 1.f1 f2 ... f8 has 256 entries. +// They are 1 + k/2^8, k = 0 ... 255 +// These 256 values are the table entries. 
+// +// Implementation +//=============== +// CASE 1: |x-1| >= 2^-8 +// C = frcpa(x) +// r = C * x - 1 +// +// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 +// +// x = f * 2^n where f is 1.f_1f_2f_3....f_63 +// Nfloat = float(n) where n is the true unbiased exponent +// pre-index = f_1f_2....f_8 +// index = pre_index * 16 +// get the dxt table entry at index + offset = T +// +// result = (T + Nfloat * log(2)) + rseries +// +// The T table is calculated as follows +// Form x_k = 1 + k/2^8 where k goes from 0... 255 +// y_k = frcpa(x_k) +// log(1/y_k) in quad and round to double + +// CASE 2: |x-1| < 2^-8 +// w = x - 1 +// +// Form wseries = w + Q1*w^2 + Q2*w^3 + Q3*w^4 +// +// result = wseries + +// Special values +//============================================================== + + +// log(+0) = -inf +// log(-0) = -inf + +// log(+qnan) = +qnan +// log(-qnan) = -qnan +// log(+snan) = +qnan +// log(-snan) = -qnan + +// log(-n) = QNAN Indefinite +// log(-inf) = QNAN Indefinite + +// log(+inf) = +inf + +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input +// f9 -> f15, f32 -> f47 + +// General registers used: +// r32 -> r51 + +// Predicate registers used: +// p6 -> p15 + +// p8 log base e +// p6 log base e special +// p9 used in the frcpa +// p13 log base e large W +// p14 log base e small w + +// p7 log base 10 +// p10 log base 10 large W +// p11 log base 10 small w +// p12 log base 10 special + +#include "libm_support.h" + +// Assembly macros +//============================================================== + +log_int_Nfloat = f9 +log_Nfloat = f10 + +log_P3 = f11 +log_P2 = f12 +log_P1 = f13 +log_inv_ln10 = f14 +log_log2 = f15 + +log_w = f32 +log_T = f33 +log_rp_p32 = f34 +log_rp_p2 = f35 +log_rp_p10 = f36 +log_rsq = f37 +log_T_plus_Nlog2 = f38 +log_r = f39 +log_C = f40 +log_rp_q32 = f41 +log_rp_q2 = f42 +log_rp_q10 = f43 +log_wsq = f44 +log_Q = f45 +// NOTE(review): log_inv_ln10 is also assigned f14 above; presumably this +// later assignment is the one in effect for the code that follows - +// confirm and drop the unused one. +log_inv_ln10 = f46 +log_NORM_f8 = f47 + 
+// =================================== + +// General-register aliases (r33 -> r51). +log_GR_exp_17_ones = r33 +log_GR_exp_16_ones = r34 +log_GR_exp_f8 = r35 +log_GR_signexp_f8 = r36 +log_GR_true_exp_f8 = r37 +log_GR_significand_f8 = r38 +log_GR_index = r39 +log_AD_1 = r40 +log_GR_signexp_w = r41 +log_GR_fff7 = r42 +log_AD_2 = r43 +log_GR_exp_w = r44 + +GR_SAVE_B0 = r45 +GR_SAVE_GP = r46 +GR_SAVE_PFS = r47 + +GR_Parameter_X = r48 +GR_Parameter_Y = r49 +GR_Parameter_RESULT = r50 +log_GR_tag = r51 + + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +log_table_1: +ASM_TYPE_DIRECTIVE(log_table_1,@object) +data8 0xbfd0001008f39d59 // p3 ~ -1/4, coefficient of r^4 in rseries +data8 0x3fd5556073e0c45a // p2 ~ +1/3, coefficient of r^3 in rseries +ASM_SIZE_DIRECTIVE(log_table_1) + +log_table_2: +ASM_TYPE_DIRECTIVE(log_table_2,@object) +data8 0xbfdffffffffaea15 // p1 ~ -1/2, coefficient of r^2 in rseries +data8 0x3fdbcb7b1526e50e // 1/ln(10) ~ 0.4343 +data8 0x3fe62e42fefa39ef // ln(2) ~ 0.6931 +data8 0x0 // pad to a 16-byte multiple + +// T table: one IEEE double per k, T_k = log(1/frcpa(1 + k/256)) +data8 0x3F60040155D5889E //log(1/frcpa(1+ 0/256) +data8 0x3F78121214586B54 //log(1/frcpa(1+ 1/256) +data8 0x3F841929F96832F0 //log(1/frcpa(1+ 2/256) +data8 0x3F8C317384C75F06 //log(1/frcpa(1+ 3/256) +data8 0x3F91A6B91AC73386 //log(1/frcpa(1+ 4/256) +data8 0x3F95BA9A5D9AC039 //log(1/frcpa(1+ 5/256) +data8 0x3F99D2A8074325F4 //log(1/frcpa(1+ 6/256) +data8 0x3F9D6B2725979802 //log(1/frcpa(1+ 7/256) +data8 0x3FA0C58FA19DFAAA //log(1/frcpa(1+ 8/256) +data8 0x3FA2954C78CBCE1B //log(1/frcpa(1+ 9/256) +data8 0x3FA4A94D2DA96C56 //log(1/frcpa(1+ 10/256) +data8 0x3FA67C94F2D4BB58 //log(1/frcpa(1+ 11/256) +data8 0x3FA85188B630F068 //log(1/frcpa(1+ 12/256) +data8 0x3FAA6B8ABE73AF4C //log(1/frcpa(1+ 13/256) +data8 0x3FAC441E06F72A9E //log(1/frcpa(1+ 14/256) +data8 0x3FAE1E6713606D07 //log(1/frcpa(1+ 15/256) +data8 0x3FAFFA6911AB9301 //log(1/frcpa(1+ 16/256) +data8 0x3FB0EC139C5DA601 //log(1/frcpa(1+ 17/256) +data8 0x3FB1DBD2643D190B //log(1/frcpa(1+ 18/256) +data8 0x3FB2CC7284FE5F1C //log(1/frcpa(1+ 19/256) +data8 0x3FB3BDF5A7D1EE64 //log(1/frcpa(1+ 20/256) 
+data8 0x3FB4B05D7AA012E0 //log(1/frcpa(1+ 21/256) +data8 0x3FB580DB7CEB5702 //log(1/frcpa(1+ 22/256) +data8 0x3FB674F089365A7A //log(1/frcpa(1+ 23/256) +data8 0x3FB769EF2C6B568D //log(1/frcpa(1+ 24/256) +data8 0x3FB85FD927506A48 //log(1/frcpa(1+ 25/256) +data8 0x3FB9335E5D594989 //log(1/frcpa(1+ 26/256) +data8 0x3FBA2B0220C8E5F5 //log(1/frcpa(1+ 27/256) +data8 0x3FBB0004AC1A86AC //log(1/frcpa(1+ 28/256) +data8 0x3FBBF968769FCA11 //log(1/frcpa(1+ 29/256) +data8 0x3FBCCFEDBFEE13A8 //log(1/frcpa(1+ 30/256) +data8 0x3FBDA727638446A2 //log(1/frcpa(1+ 31/256) +data8 0x3FBEA3257FE10F7A //log(1/frcpa(1+ 32/256) +data8 0x3FBF7BE9FEDBFDE6 //log(1/frcpa(1+ 33/256) +data8 0x3FC02AB352FF25F4 //log(1/frcpa(1+ 34/256) +data8 0x3FC097CE579D204D //log(1/frcpa(1+ 35/256) +data8 0x3FC1178E8227E47C //log(1/frcpa(1+ 36/256) +data8 0x3FC185747DBECF34 //log(1/frcpa(1+ 37/256) +data8 0x3FC1F3B925F25D41 //log(1/frcpa(1+ 38/256) +data8 0x3FC2625D1E6DDF57 //log(1/frcpa(1+ 39/256) +data8 0x3FC2D1610C86813A //log(1/frcpa(1+ 40/256) +data8 0x3FC340C59741142E //log(1/frcpa(1+ 41/256) +data8 0x3FC3B08B6757F2A9 //log(1/frcpa(1+ 42/256) +data8 0x3FC40DFB08378003 //log(1/frcpa(1+ 43/256) +data8 0x3FC47E74E8CA5F7C //log(1/frcpa(1+ 44/256) +data8 0x3FC4EF51F6466DE4 //log(1/frcpa(1+ 45/256) +data8 0x3FC56092E02BA516 //log(1/frcpa(1+ 46/256) +data8 0x3FC5D23857CD74D5 //log(1/frcpa(1+ 47/256) +data8 0x3FC6313A37335D76 //log(1/frcpa(1+ 48/256) +data8 0x3FC6A399DABBD383 //log(1/frcpa(1+ 49/256) +data8 0x3FC70337DD3CE41B //log(1/frcpa(1+ 50/256) +data8 0x3FC77654128F6127 //log(1/frcpa(1+ 51/256) +data8 0x3FC7E9D82A0B022D //log(1/frcpa(1+ 52/256) +data8 0x3FC84A6B759F512F //log(1/frcpa(1+ 53/256) +data8 0x3FC8AB47D5F5A310 //log(1/frcpa(1+ 54/256) +data8 0x3FC91FE49096581B //log(1/frcpa(1+ 55/256) +data8 0x3FC981634011AA75 //log(1/frcpa(1+ 56/256) +data8 0x3FC9F6C407089664 //log(1/frcpa(1+ 57/256) +data8 0x3FCA58E729348F43 //log(1/frcpa(1+ 58/256) +data8 0x3FCABB55C31693AD //log(1/frcpa(1+ 59/256) +data8 
0x3FCB1E104919EFD0 //log(1/frcpa(1+ 60/256) +data8 0x3FCB94EE93E367CB //log(1/frcpa(1+ 61/256) +data8 0x3FCBF851C067555F //log(1/frcpa(1+ 62/256) +data8 0x3FCC5C0254BF23A6 //log(1/frcpa(1+ 63/256) +data8 0x3FCCC000C9DB3C52 //log(1/frcpa(1+ 64/256) +data8 0x3FCD244D99C85674 //log(1/frcpa(1+ 65/256) +data8 0x3FCD88E93FB2F450 //log(1/frcpa(1+ 66/256) +data8 0x3FCDEDD437EAEF01 //log(1/frcpa(1+ 67/256) +data8 0x3FCE530EFFE71012 //log(1/frcpa(1+ 68/256) +data8 0x3FCEB89A1648B971 //log(1/frcpa(1+ 69/256) +data8 0x3FCF1E75FADF9BDE //log(1/frcpa(1+ 70/256) +data8 0x3FCF84A32EAD7C35 //log(1/frcpa(1+ 71/256) +data8 0x3FCFEB2233EA07CD //log(1/frcpa(1+ 72/256) +data8 0x3FD028F9C7035C1C //log(1/frcpa(1+ 73/256) +data8 0x3FD05C8BE0D9635A //log(1/frcpa(1+ 74/256) +data8 0x3FD085EB8F8AE797 //log(1/frcpa(1+ 75/256) +data8 0x3FD0B9C8E32D1911 //log(1/frcpa(1+ 76/256) +data8 0x3FD0EDD060B78081 //log(1/frcpa(1+ 77/256) +data8 0x3FD122024CF0063F //log(1/frcpa(1+ 78/256) +data8 0x3FD14BE2927AECD4 //log(1/frcpa(1+ 79/256) +data8 0x3FD180618EF18ADF //log(1/frcpa(1+ 80/256) +data8 0x3FD1B50BBE2FC63B //log(1/frcpa(1+ 81/256) +data8 0x3FD1DF4CC7CF242D //log(1/frcpa(1+ 82/256) +data8 0x3FD214456D0EB8D4 //log(1/frcpa(1+ 83/256) +data8 0x3FD23EC5991EBA49 //log(1/frcpa(1+ 84/256) +data8 0x3FD2740D9F870AFB //log(1/frcpa(1+ 85/256) +data8 0x3FD29ECDABCDFA04 //log(1/frcpa(1+ 86/256) +data8 0x3FD2D46602ADCCEE //log(1/frcpa(1+ 87/256) +data8 0x3FD2FF66B04EA9D4 //log(1/frcpa(1+ 88/256) +data8 0x3FD335504B355A37 //log(1/frcpa(1+ 89/256) +data8 0x3FD360925EC44F5D //log(1/frcpa(1+ 90/256) +data8 0x3FD38BF1C3337E75 //log(1/frcpa(1+ 91/256) +data8 0x3FD3C25277333184 //log(1/frcpa(1+ 92/256) +data8 0x3FD3EDF463C1683E //log(1/frcpa(1+ 93/256) +data8 0x3FD419B423D5E8C7 //log(1/frcpa(1+ 94/256) +data8 0x3FD44591E0539F49 //log(1/frcpa(1+ 95/256) +data8 0x3FD47C9175B6F0AD //log(1/frcpa(1+ 96/256) +data8 0x3FD4A8B341552B09 //log(1/frcpa(1+ 97/256) +data8 0x3FD4D4F3908901A0 //log(1/frcpa(1+ 98/256) +data8 
0x3FD501528DA1F968 //log(1/frcpa(1+ 99/256) +data8 0x3FD52DD06347D4F6 //log(1/frcpa(1+ 100/256) +data8 0x3FD55A6D3C7B8A8A //log(1/frcpa(1+ 101/256) +data8 0x3FD5925D2B112A59 //log(1/frcpa(1+ 102/256) +data8 0x3FD5BF406B543DB2 //log(1/frcpa(1+ 103/256) +data8 0x3FD5EC433D5C35AE //log(1/frcpa(1+ 104/256) +data8 0x3FD61965CDB02C1F //log(1/frcpa(1+ 105/256) +data8 0x3FD646A84935B2A2 //log(1/frcpa(1+ 106/256) +data8 0x3FD6740ADD31DE94 //log(1/frcpa(1+ 107/256) +data8 0x3FD6A18DB74A58C5 //log(1/frcpa(1+ 108/256) +data8 0x3FD6CF31058670EC //log(1/frcpa(1+ 109/256) +data8 0x3FD6F180E852F0BA //log(1/frcpa(1+ 110/256) +data8 0x3FD71F5D71B894F0 //log(1/frcpa(1+ 111/256) +data8 0x3FD74D5AEFD66D5C //log(1/frcpa(1+ 112/256) +data8 0x3FD77B79922BD37E //log(1/frcpa(1+ 113/256) +data8 0x3FD7A9B9889F19E2 //log(1/frcpa(1+ 114/256) +data8 0x3FD7D81B037EB6A6 //log(1/frcpa(1+ 115/256) +data8 0x3FD8069E33827231 //log(1/frcpa(1+ 116/256) +data8 0x3FD82996D3EF8BCB //log(1/frcpa(1+ 117/256) +data8 0x3FD85855776DCBFB //log(1/frcpa(1+ 118/256) +data8 0x3FD8873658327CCF //log(1/frcpa(1+ 119/256) +data8 0x3FD8AA75973AB8CF //log(1/frcpa(1+ 120/256) +data8 0x3FD8D992DC8824E5 //log(1/frcpa(1+ 121/256) +data8 0x3FD908D2EA7D9512 //log(1/frcpa(1+ 122/256) +data8 0x3FD92C59E79C0E56 //log(1/frcpa(1+ 123/256) +data8 0x3FD95BD750EE3ED3 //log(1/frcpa(1+ 124/256) +data8 0x3FD98B7811A3EE5B //log(1/frcpa(1+ 125/256) +data8 0x3FD9AF47F33D406C //log(1/frcpa(1+ 126/256) +data8 0x3FD9DF270C1914A8 //log(1/frcpa(1+ 127/256) +data8 0x3FDA0325ED14FDA4 //log(1/frcpa(1+ 128/256) +data8 0x3FDA33440224FA79 //log(1/frcpa(1+ 129/256) +data8 0x3FDA57725E80C383 //log(1/frcpa(1+ 130/256) +data8 0x3FDA87D0165DD199 //log(1/frcpa(1+ 131/256) +data8 0x3FDAAC2E6C03F896 //log(1/frcpa(1+ 132/256) +data8 0x3FDADCCC6FDF6A81 //log(1/frcpa(1+ 133/256) +data8 0x3FDB015B3EB1E790 //log(1/frcpa(1+ 134/256) +data8 0x3FDB323A3A635948 //log(1/frcpa(1+ 135/256) +data8 0x3FDB56FA04462909 //log(1/frcpa(1+ 136/256) +data8 0x3FDB881AA659BC93 
//log(1/frcpa(1+ 137/256) +data8 0x3FDBAD0BEF3DB165 //log(1/frcpa(1+ 138/256) +data8 0x3FDBD21297781C2F //log(1/frcpa(1+ 139/256) +data8 0x3FDC039236F08819 //log(1/frcpa(1+ 140/256) +data8 0x3FDC28CB1E4D32FD //log(1/frcpa(1+ 141/256) +data8 0x3FDC4E19B84723C2 //log(1/frcpa(1+ 142/256) +data8 0x3FDC7FF9C74554C9 //log(1/frcpa(1+ 143/256) +data8 0x3FDCA57B64E9DB05 //log(1/frcpa(1+ 144/256) +data8 0x3FDCCB130A5CEBB0 //log(1/frcpa(1+ 145/256) +data8 0x3FDCF0C0D18F326F //log(1/frcpa(1+ 146/256) +data8 0x3FDD232075B5A201 //log(1/frcpa(1+ 147/256) +data8 0x3FDD490246DEFA6B //log(1/frcpa(1+ 148/256) +data8 0x3FDD6EFA918D25CD //log(1/frcpa(1+ 149/256) +data8 0x3FDD9509707AE52F //log(1/frcpa(1+ 150/256) +data8 0x3FDDBB2EFE92C554 //log(1/frcpa(1+ 151/256) +data8 0x3FDDEE2F3445E4AF //log(1/frcpa(1+ 152/256) +data8 0x3FDE148A1A2726CE //log(1/frcpa(1+ 153/256) +data8 0x3FDE3AFC0A49FF40 //log(1/frcpa(1+ 154/256) +data8 0x3FDE6185206D516E //log(1/frcpa(1+ 155/256) +data8 0x3FDE882578823D52 //log(1/frcpa(1+ 156/256) +data8 0x3FDEAEDD2EAC990C //log(1/frcpa(1+ 157/256) +data8 0x3FDED5AC5F436BE3 //log(1/frcpa(1+ 158/256) +data8 0x3FDEFC9326D16AB9 //log(1/frcpa(1+ 159/256) +data8 0x3FDF2391A2157600 //log(1/frcpa(1+ 160/256) +data8 0x3FDF4AA7EE03192D //log(1/frcpa(1+ 161/256) +data8 0x3FDF71D627C30BB0 //log(1/frcpa(1+ 162/256) +data8 0x3FDF991C6CB3B379 //log(1/frcpa(1+ 163/256) +data8 0x3FDFC07ADA69A910 //log(1/frcpa(1+ 164/256) +data8 0x3FDFE7F18EB03D3E //log(1/frcpa(1+ 165/256) +data8 0x3FE007C053C5002E //log(1/frcpa(1+ 166/256) +data8 0x3FE01B942198A5A1 //log(1/frcpa(1+ 167/256) +data8 0x3FE02F74400C64EB //log(1/frcpa(1+ 168/256) +data8 0x3FE04360BE7603AD //log(1/frcpa(1+ 169/256) +data8 0x3FE05759AC47FE34 //log(1/frcpa(1+ 170/256) +data8 0x3FE06B5F1911CF52 //log(1/frcpa(1+ 171/256) +data8 0x3FE078BF0533C568 //log(1/frcpa(1+ 172/256) +data8 0x3FE08CD9687E7B0E //log(1/frcpa(1+ 173/256) +data8 0x3FE0A10074CF9019 //log(1/frcpa(1+ 174/256) +data8 0x3FE0B5343A234477 //log(1/frcpa(1+ 
175/256) +data8 0x3FE0C974C89431CE //log(1/frcpa(1+ 176/256) +data8 0x3FE0DDC2305B9886 //log(1/frcpa(1+ 177/256) +data8 0x3FE0EB524BAFC918 //log(1/frcpa(1+ 178/256) +data8 0x3FE0FFB54213A476 //log(1/frcpa(1+ 179/256) +data8 0x3FE114253DA97D9F //log(1/frcpa(1+ 180/256) +data8 0x3FE128A24F1D9AFF //log(1/frcpa(1+ 181/256) +data8 0x3FE1365252BF0865 //log(1/frcpa(1+ 182/256) +data8 0x3FE14AE558B4A92D //log(1/frcpa(1+ 183/256) +data8 0x3FE15F85A19C765B //log(1/frcpa(1+ 184/256) +data8 0x3FE16D4D38C119FA //log(1/frcpa(1+ 185/256) +data8 0x3FE18203C20DD133 //log(1/frcpa(1+ 186/256) +data8 0x3FE196C7BC4B1F3B //log(1/frcpa(1+ 187/256) +data8 0x3FE1A4A738B7A33C //log(1/frcpa(1+ 188/256) +data8 0x3FE1B981C0C9653D //log(1/frcpa(1+ 189/256) +data8 0x3FE1CE69E8BB106B //log(1/frcpa(1+ 190/256) +data8 0x3FE1DC619DE06944 //log(1/frcpa(1+ 191/256) +data8 0x3FE1F160A2AD0DA4 //log(1/frcpa(1+ 192/256) +data8 0x3FE2066D7740737E //log(1/frcpa(1+ 193/256) +data8 0x3FE2147DBA47A394 //log(1/frcpa(1+ 194/256) +data8 0x3FE229A1BC5EBAC3 //log(1/frcpa(1+ 195/256) +data8 0x3FE237C1841A502E //log(1/frcpa(1+ 196/256) +data8 0x3FE24CFCE6F80D9A //log(1/frcpa(1+ 197/256) +data8 0x3FE25B2C55CD5762 //log(1/frcpa(1+ 198/256) +data8 0x3FE2707F4D5F7C41 //log(1/frcpa(1+ 199/256) +data8 0x3FE285E0842CA384 //log(1/frcpa(1+ 200/256) +data8 0x3FE294294708B773 //log(1/frcpa(1+ 201/256) +data8 0x3FE2A9A2670AFF0C //log(1/frcpa(1+ 202/256) +data8 0x3FE2B7FB2C8D1CC1 //log(1/frcpa(1+ 203/256) +data8 0x3FE2C65A6395F5F5 //log(1/frcpa(1+ 204/256) +data8 0x3FE2DBF557B0DF43 //log(1/frcpa(1+ 205/256) +data8 0x3FE2EA64C3F97655 //log(1/frcpa(1+ 206/256) +data8 0x3FE3001823684D73 //log(1/frcpa(1+ 207/256) +data8 0x3FE30E97E9A8B5CD //log(1/frcpa(1+ 208/256) +data8 0x3FE32463EBDD34EA //log(1/frcpa(1+ 209/256) +data8 0x3FE332F4314AD796 //log(1/frcpa(1+ 210/256) +data8 0x3FE348D90E7464D0 //log(1/frcpa(1+ 211/256) +data8 0x3FE35779F8C43D6E //log(1/frcpa(1+ 212/256) +data8 0x3FE36621961A6A99 //log(1/frcpa(1+ 213/256) +data8 
0x3FE37C299F3C366A //log(1/frcpa(1+ 214/256) +data8 0x3FE38AE2171976E7 //log(1/frcpa(1+ 215/256) +data8 0x3FE399A157A603E7 //log(1/frcpa(1+ 216/256) +data8 0x3FE3AFCCFE77B9D1 //log(1/frcpa(1+ 217/256) +data8 0x3FE3BE9D503533B5 //log(1/frcpa(1+ 218/256) +data8 0x3FE3CD7480B4A8A3 //log(1/frcpa(1+ 219/256) +data8 0x3FE3E3C43918F76C //log(1/frcpa(1+ 220/256) +data8 0x3FE3F2ACB27ED6C7 //log(1/frcpa(1+ 221/256) +data8 0x3FE4019C2125CA93 //log(1/frcpa(1+ 222/256) +data8 0x3FE4181061389722 //log(1/frcpa(1+ 223/256) +data8 0x3FE42711518DF545 //log(1/frcpa(1+ 224/256) +data8 0x3FE436194E12B6BF //log(1/frcpa(1+ 225/256) +data8 0x3FE445285D68EA69 //log(1/frcpa(1+ 226/256) +data8 0x3FE45BCC464C893A //log(1/frcpa(1+ 227/256) +data8 0x3FE46AED21F117FC //log(1/frcpa(1+ 228/256) +data8 0x3FE47A1527E8A2D3 //log(1/frcpa(1+ 229/256) +data8 0x3FE489445EFFFCCC //log(1/frcpa(1+ 230/256) +data8 0x3FE4A018BCB69835 //log(1/frcpa(1+ 231/256) +data8 0x3FE4AF5A0C9D65D7 //log(1/frcpa(1+ 232/256) +data8 0x3FE4BEA2A5BDBE87 //log(1/frcpa(1+ 233/256) +data8 0x3FE4CDF28F10AC46 //log(1/frcpa(1+ 234/256) +data8 0x3FE4DD49CF994058 //log(1/frcpa(1+ 235/256) +data8 0x3FE4ECA86E64A684 //log(1/frcpa(1+ 236/256) +data8 0x3FE503C43CD8EB68 //log(1/frcpa(1+ 237/256) +data8 0x3FE513356667FC57 //log(1/frcpa(1+ 238/256) +data8 0x3FE522AE0738A3D8 //log(1/frcpa(1+ 239/256) +data8 0x3FE5322E26867857 //log(1/frcpa(1+ 240/256) +data8 0x3FE541B5CB979809 //log(1/frcpa(1+ 241/256) +data8 0x3FE55144FDBCBD62 //log(1/frcpa(1+ 242/256) +data8 0x3FE560DBC45153C7 //log(1/frcpa(1+ 243/256) +data8 0x3FE5707A26BB8C66 //log(1/frcpa(1+ 244/256) +data8 0x3FE587F60ED5B900 //log(1/frcpa(1+ 245/256) +data8 0x3FE597A7977C8F31 //log(1/frcpa(1+ 246/256) +data8 0x3FE5A760D634BB8B //log(1/frcpa(1+ 247/256) +data8 0x3FE5B721D295F10F //log(1/frcpa(1+ 248/256) +data8 0x3FE5C6EA94431EF9 //log(1/frcpa(1+ 249/256) +data8 0x3FE5D6BB22EA86F6 //log(1/frcpa(1+ 250/256) +data8 0x3FE5E6938645D390 //log(1/frcpa(1+ 251/256) +data8 0x3FE5F673C61A2ED2 
//log(1/frcpa(1+ 252/256) +data8 0x3FE6065BEA385926 //log(1/frcpa(1+ 253/256) +data8 0x3FE6164BFA7CC06B //log(1/frcpa(1+ 254/256) +data8 0x3FE62643FECF9743 //log(1/frcpa(1+ 255/256) +ASM_SIZE_DIRECTIVE(log_table_2) + + +.align 32 +.global logf# +.global log10f# + +// log10 has p7 true, p8 false +// log has p8 true, p7 false + +.section .text +.proc log10f# +.align 32 + +log10f: +#ifdef _LIBC +.global __ieee754_log10f +.type __ieee754_log10f,@function +__ieee754_log10f: +#endif +{ .mfi + alloc r32=ar.pfs,1,15,4,0 + frcpa.s1 log_C,p9 = f1,f8 + cmp.eq.unc p7,p8 = r0, r0 +} +{ .mfb + addl log_AD_1 = @ltoff(log_table_1), gp + fnorm.s1 log_NORM_f8 = f8 + br.sptk L(LOG_LOG10_X) +} +;; + +.endp log10f +ASM_SIZE_DIRECTIVE(log10f) +ASM_SIZE_DIRECTIVE(__ieee754_log10f) + + + +.section .text +.proc logf# +.align 32 +logf: +#ifdef _LIBC +.global __ieee754_logf +.type __ieee754_logf,@function +__ieee754_logf: +#endif + +{ .mfi + alloc r32=ar.pfs,1,15,4,0 + frcpa.s1 log_C,p9 = f1,f8 + cmp.eq.unc p8,p7 = r0, r0 +} +{ .mfi + addl log_AD_1 = @ltoff(log_table_1), gp + fnorm.s1 log_NORM_f8 = f8 + nop.i 999 +} +;; + +L(LOG_LOG10_X): + +{ .mfi + getf.exp log_GR_signexp_f8 = f8 // If x unorm then must recompute + fclass.m.unc p15,p0 = f8, 0x0b // Test for x=unorm + mov log_GR_fff7 = 0xfff7 +} +{ .mfi + ld8 log_AD_1 = [log_AD_1] + fms.s1 log_w = f8,f1,f1 + mov log_GR_exp_17_ones = 0x1ffff +} +;; + +{ .mmi + getf.sig log_GR_significand_f8 = f8 // If x unorm then must recompute + mov log_GR_exp_16_ones = 0xffff + nop.i 999 +} +;; + +{ .mmb + adds log_AD_2 = 0x10, log_AD_1 + and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones +(p15) br.cond.spnt L(LOG_DENORM) +} +;; + +L(LOG_COMMON): +{.mfi + ldfpd log_P3,log_P2 = [log_AD_1],16 + fclass.m.unc p6,p0 = f8, 0xc3 // Test for x=nan + shl log_GR_index = log_GR_significand_f8,1 +} +{.mfi + sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones + nop.f 999 + nop.i 999 +} +;; + +{ .mfi + ldfpd log_P1,log_inv_ln10 = [log_AD_2],16 + 
fclass.m.unc p11,p0 = f8, 0x21 // Test for x=+inf + shr.u log_GR_index = log_GR_index,56 +} +{ .mfi + setf.sig log_int_Nfloat = log_GR_true_exp_f8 + nop.f 999 + nop.i 999 +} +;; + + +{ .mfi + ldfd log_log2 = [log_AD_2],16 + fma.s1 log_wsq = log_w, log_w, f0 + nop.i 999 +} +{ .mfb + nop.m 999 +(p6) fma.s.s0 f8 = f8,f1,f0 // quietize nan result if x=nan +(p6) br.ret.spnt b0 // Exit for x=nan +} +;; + + +{ .mfi + shladd log_AD_2 = log_GR_index,3,log_AD_2 + fcmp.eq.s1 p10,p0 = log_NORM_f8, f1 // Test for x=+1.0 + nop.i 999 +} +{ .mfb + nop.m 999 + fms.s1 log_r = log_C,f8,f1 +(p11) br.ret.spnt b0 // Exit for x=+inf +} +;; + + +{ .mmf + nop.m 999 + nop.m 999 + fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 +} +;; + + +{ .mfb + ldfd log_T = [log_AD_2] +(p10) fmerge.s f8 = f0, f0 +(p10) br.ret.spnt b0 // Exit for x=1.0 +;; +} + +{ .mfi + getf.exp log_GR_signexp_w = log_w + fclass.m.unc p12,p0 = f8, 0x3a // Test for x neg norm, unorm, inf + nop.i 999 +} +;; + +{ .mmb + nop.m 999 + nop.m 999 +(p6) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x=0 +;; +} + + +{ .mfi + and log_GR_exp_w = log_GR_exp_17_ones, log_GR_signexp_w + nop.f 999 + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 log_rsq = log_r, log_r, f0 +(p12) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x<0 +;; +} + +{ .mfi + nop.m 999 + fma.s1 log_rp_p32 = log_P3, log_r, log_P2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 log_rp_q32 = log_P3, log_w, log_P2 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fcvt.xf log_Nfloat = log_int_Nfloat + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 log_rp_p10 = log_P1, log_r, f1 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 log_rp_q10 = log_P1, log_w, f1 + nop.i 999 +;; +} + +// p13 <== large w log +// p14 <== small w log +{ .mfi +(p8) cmp.ge.unc p13,p14 = log_GR_exp_w, log_GR_fff7 + fcmp.eq.s0 p6,p0 = f8,f0 // Sets flag on +denormal input + nop.i 999 +;; +} + +// p10 <== large w log10 +// p11 <== small w log10 +{ .mfi +(p7) cmp.ge.unc p10,p11 = log_GR_exp_w, log_GR_fff7 + nop.f 999 + nop.i 999 ;; +} 
+ +{ .mfi + nop.m 999 + fma.s1 log_T_plus_Nlog2 = log_Nfloat,log_log2, log_T + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 + fma.s1 log_rp_p2 = log_rp_p32, log_rsq, log_rp_p10 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 log_rp_q2 = log_rp_q32, log_wsq, log_rp_q10 + nop.i 999 +;; +} + + +// small w, log <== p14 +{ .mfi + nop.m 999 +(p14) fma.s f8 = log_rp_q2, log_w, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fma.s1 log_Q = log_rp_q2, log_w, f0 + nop.i 999 ;; +} + + +// large w, log <== p13 +.pred.rel "mutex",p13,p10 +{ .mfi + nop.m 999 +(p13) fma.s f8 = log_rp_p2, log_r, log_T_plus_Nlog2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fma.s1 log_Q = log_rp_p2, log_r, log_T_plus_Nlog2 + nop.i 999 ;; +} + + +// log10 +{ .mfb + nop.m 999 +(p7) fma.s f8 = log_inv_ln10,log_Q,f0 + br.ret.sptk b0 +;; +} + + +L(LOG_DENORM): +{ .mmi + getf.exp log_GR_signexp_f8 = log_NORM_f8 + nop.m 999 + nop.i 999 +} +;; +{ .mmb + getf.sig log_GR_significand_f8 = log_NORM_f8 + and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones + br.cond.sptk L(LOG_COMMON) +} +;; + +L(LOG_ZERO_NEG): + +// qnan snan inf norm unorm 0 -+ +// 0 0 0 0 0 1 11 0x7 +// 0 0 1 1 1 0 10 0x3a + +// Save x (f8) in f10 +{ .mfi + nop.m 999 + fmerge.s f10 = f8,f8 + nop.i 999 ;; +} + +// p8 p9 means ln(+-0) = -inf +// p7 p10 means log(+-0) = -inf + +// p13 means ln(-) +// p14 means log(-) + + +{ .mfi + nop.m 999 + fmerge.ns f6 = f1,f1 // Form -1.0 + nop.i 999 ;; +} + +// p9 means ln(+-0) = -inf +// p10 means log(+-0) = -inf +// Log(+-0) = -inf + +{ .mfi + nop.m 999 +(p8) fclass.m.unc p9,p0 = f10, 0x07 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fclass.m.unc p10,p0 = f10, 0x07 + nop.i 999 ;; +} + + +// p13 ln(-) +// p14 log(-) + +// Log(-inf, -normal, -unnormal) = QNAN indefinite +{ .mfi + nop.m 999 +(p8) fclass.m.unc p13,p0 = f10, 0x3a + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fclass.m.unc p14,p0 = f10, 0x3a + nop.i 999 ;; +} + + +.pred.rel "mutex",p9,p10 +{ .mfi +(p9) mov log_GR_tag = 4 +(p9) frcpa f8,p11 = f6,f0 + nop.i 999 +} 
+{ .mfi +(p10) mov log_GR_tag = 10 +(p10) frcpa f8,p12 = f6,f0 + nop.i 999 ;; +} + +.pred.rel "mutex",p13,p14 +{ .mfi +(p13) mov log_GR_tag = 5 +(p13) frcpa f8,p11 = f0,f0 + nop.i 999 +} +{ .mfb +(p14) mov log_GR_tag = 11 +(p14) frcpa f8,p12 = f0,f0 + br.cond.sptk __libm_error_region ;; +} +.endp logf +ASM_SIZE_DIRECTIVE(logf) +ASM_SIZE_DIRECTIVE(__ieee754_logf) + + +// Stack operations when calling error support. +// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 +// | | | | +// | <-GR_Y Y2->| Y2 ->| <- GR_Y | +// | | | | +// | | <- GR_X X1 ->| | +// | | | | +// sp-64 -> + sp -> + sp -> + + +// save ar.pfs save b0 restore gp +// save gp restore ar.pfs + + + +.proc __libm_error_region +__libm_error_region: +.prologue + +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfs [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfs [GR_Parameter_X] = f10 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; + +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region 
+ASM_SIZE_DIRECTIVE(__libm_error_region) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_logl.c b/sysdeps/ia64/fpu/e_logl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/e_logl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/e_pow.S b/sysdeps/ia64/fpu/e_pow.S new file mode 100644 index 0000000000..acc3ed8371 --- /dev/null +++ b/sysdeps/ia64/fpu/e_pow.S @@ -0,0 +1,2309 @@ +.file "pow.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 2/03/00 Added p12 to definite over/under path. With odd power we did not +// maintain the sign of x in this path. 
+// 4/04/00 Unwind support added +// 4/19/00 pow(+-1,inf) now returns NaN +// pow(+-val, +-inf) returns 0 or inf, but now does not call error support +// Added s1 to fcvt.fx because invalid flag was incorrectly set. +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 9/07/00 Improved performance by eliminating bank conflicts and other stalls, +// and tweaking the critical path +// 9/08/00 Per c99, pow(+-1,inf) now returns 1, and pow(+1,nan) returns 1 +// 9/28/00 Updated NaN**0 path +// 1/20/01 Fixed denormal flag settings. +// 2/12/01 Improved speed. +// +// API +//============================================================== +// double pow(double) +// float powf(float) +// +// Overview of operation +//============================================================== +// +// Three steps... +// 1. Log(x) +// 2. y Log(x) +// 3. exp(y log(x)) +// +// This means we work with the absolute value of x and merge in the sign later. +// Log(x) = G + delta + r -rsq/2 + p +// G,delta depend on the exponent of x and table entries. The table entries are +// indexed by the exponent of x, called K. +// +// The G and delta come out of the reduction; r is the reduced x. +// +// B = frcpa(x) +// xB-1 is small means that B is the approximate inverse of x. +// +// Log(x) = Log( (1/B)(Bx) ) +// = Log(1/B) + Log(Bx) +// = Log(1/B) + Log( 1 + (Bx-1)) +// +// x = 2^K 1.x_1x_2.....x_52 +// B= frcpa(x) = 2^-k Cm +// Log(1/B) = Log(1/(2^-K Cm)) +// Log(1/B) = Log((2^K/ Cm)) +// Log(1/B) = K Log(2) + Log(1/Cm) +// +// Log(x) = K Log(2) + Log(1/Cm) + Log( 1 + (Bx-1)) +// +// If you take the significand of x, set the exponent to true 0, then Cm is +// the frcpa. We tabulate the Log(1/Cm) values. There are 256 of them. +// The frcpa table is indexed by 8 bits, the x_1 thru x_8. +// m = x_1x_2...x_8 is an 8-bit index. +// +// Log(1/Cm) = log(1/frcpa(1+m/256)) where m goes from 0 to 255. 
+// +// We tabluate as two doubles, T and t, where T +t is the value itself. +// +// Log(x) = (K Log(2)_hi + T) + (Log(2)_hi + t) + Log( 1 + (Bx-1)) +// Log(x) = G + delta + Log( 1 + (Bx-1)) +// +// The Log( 1 + (Bx-1)) can be calculated as a series in r = Bx-1. +// +// Log( 1 + (Bx-1)) = r - rsq/2 + p +// +// Then, +// +// yLog(x) = yG + y delta + y(r-rsq/2) + yp +// yLog(x) = Z1 + e3 + Z2 + Z3 + (e2 + e3) +// +// +// exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3) +// +// +// exp(Z3) is another series. +// exp(e1 + e2 + e3) is approximated as f3 = 1 + (e1 + e2 + e3) +// +// Z1 (128/log2) = number of log2/128 in Z1 is N1 +// Z2 (128/log2) = number of log2/128 in Z2 is N2 +// +// s1 = Z1 - N1 log2/128 +// s2 = Z2 - N2 log2/128 +// +// s = s1 + s2 +// N = N1 + N2 +// +// exp(Z1 + Z2) = exp(Z) +// exp(Z) = exp(s) exp(N log2/128) +// +// exp(r) = exp(Z - N log2/128) +// +// r = s + d = (Z - N (log2/128)_hi) -N (log2/128)_lo +// = Z - N (log2/128) +// +// Z = s+d +N (log2/128) +// +// exp(Z) = exp(s) (1+d) exp(N log2/128) +// +// N = M 128 + n +// +// N log2/128 = M log2 + n log2/128 +// +// n is 8 binary digits = n_7n_6...n_1 +// +// n log2/128 = n_7n_6n_5 16 log2/128 + n_4n_3n_2n_1 log2/128 +// n log2/128 = n_7n_6n_5 log2/8 + n_4n_3n_2n_1 log2/128 +// n log2/128 = I2 log2/8 + I1 log2/128 +// +// N log2/128 = M log2 + I2 log2/8 + I1 log2/128 +// +// exp(Z) = exp(s) (1+d) exp(log(2^M) + log(2^I2/8) + log(2^I1/128)) +// exp(Z) = exp(s) (1+d1) (1+d2)(2^M) 2^I2/8 2^I1/128 +// exp(Z) = exp(s) f1 f2 (2^M) 2^I2/8 2^I1/128 +// +// I1, I2 are table indices. Use a series for exp(s). +// Then get exp(Z) +// +// exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3) +// exp(yLog(x)) = exp(Z) exp(Z3) f3 +// exp(yLog(x)) = exp(Z)f3 exp(Z3) +// exp(yLog(x)) = A exp(Z3) +// +// We actually calculate exp(Z3) -1. 
+// Then, +// exp(yLog(x)) = A + A( exp(Z3) -1) +// + +// Table Generation +//============================================================== + +// The log values +// ============== +// The operation (K*log2_hi) must be exact. K is the true exponent of x. +// If we allow gradual underflow (denormals), K can be represented in 12 bits +// (as a two's complement number). We assume 13 bits as an engineering precaution. +// +// +------------+----------------+-+ +// | 13 bits | 50 bits | | +// +------------+----------------+-+ +// 0 1 66 +// 2 34 +// +// So we want the lsb(log2_hi) to be 2^-50 +// We get log2 as a quad-extended (15-bit exponent, 128-bit significand) +// +// 0 fffe b17217f7d1cf79ab c9e3b39803f2f6af (4...) +// +// Consider numbering the bits left to right, starting at 0 thru 127. +// Bit 0 is the 2^-1 bit; bit 49 is the 2^-50 bit. +// +// ...79ab +// 0111 1001 1010 1011 +// 44 +// 89 +// +// So if we shift off the rightmost 14 bits, then (shift back only +// the top half) we get +// +// 0 fffe b17217f7d1cf4000 e6af278ece600fcb dabc000000000000 +// +// Put the right 64-bit signficand in an FR register, convert to double; +// it is exact. Put the next 128 bits into a quad register and round to double. +// The true exponent of the low part is -51. +// +// hi is 0 fffe b17217f7d1cf4000 +// lo is 0 ffcc e6af278ece601000 +// +// Convert to double memory format and get +// +// hi is 0x3fe62e42fefa39e8 +// lo is 0x3cccd5e4f1d9cc02 +// +// log2_hi + log2_lo is an accurate value for log2. +// +// +// The T and t values +// ================== +// A similar method is used to generate the T and t values. +// +// K * log2_hi + T must be exact. +// +// Smallest T,t +// ---------- +// The smallest T,t is +// T t +// data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 log(1/frcpa(1+0/256))= +1.95503e-003 +// +// The exponent is 0x3f6 (biased) or -9 (true). 
+// For the smallest T value, what we want is to clip the significand such that +// when it is shifted right by 9, its lsb is in the bit for 2^-51. The 9 is the specific +// for the first entry. In general, it is 0xffff - (biased 15-bit exponent). + +// Independently, what we have calculated is the table value as a quad precision number. +// Table entry 1 is +// 0 fff6 80200aaeac44ef38 338f77605fdf8000 +// +// We store this quad precision number in a data structure that is +// sign: 1 +// exponent: 15 +// signficand_hi: 64 (includes explicit bit) +// signficand_lo: 49 +// Because the explicit bit is included, the significand is 113 bits. +// +// Consider significand_hi for table entry 1. +// +// +// +-+--- ... -------+--------------------+ +// | | +// +-+--- ... -------+--------------------+ +// 0 1 4444444455555555556666 +// 2345678901234567890123 +// +// Labeled as above, bit 0 is 2^0, bit 1 is 2^-1, etc. +// Bit 42 is 2^-42. If we shift to the right by 9, the bit in +// bit 42 goes in 51. +// +// So what we want to do is shift bits 43 thru 63 into significand_lo. +// This is shifting bit 42 into bit 63, taking care to retain the shifted-off bits. +// Then shifting (just with signficaand_hi) back into bit 42. +// +// The shift_value is 63-42 = 21. In general, this is +// 63 - (51 -(0xffff - 0xfff6)) +// For this example, it is +// 63 - (51 - 9) = 63 - 42 = 21 +// +// This means we are shifting 21 bits into significand_lo. We must maintain more +// that a 128-bit signficand not to lose bits. So before the shift we put the 128-bit +// significand into a 256-bit signficand and then shift. +// The 256-bit significand has four parts: hh, hl, lh, and ll. +// +// Start off with +// hh hl lh ll +// <64> <49><15_0> <64_0> <64_0> +// +// After shift by 21 (then return for significand_hi), +// <43><21_0> <21><43> <6><58_0> <64_0> +// +// Take the hh part and convert to a double. There is no rounding here. +// The conversion is exact. 
The true exponent of the high part is the same as the +// true exponent of the input quad. +// +// We have some 64 plus significand bits for the low part. In this example, we have +// 70 bits. We want to round this to a double. Put them in a quad and then do a quad fnorm. +// For this example the true exponent of the low part is +// true_exponent_of_high - 43 = true_exponent_of_high - (64-21) +// In general, this is +// true_exponent_of_high - (64 - shift_value) +// +// +// Largest T,t +// ---------- +// The largest T,t is +// data8 0x3fe62643fecf9742, 0x3c9e3147684bd37d log(1/frcpa(1+255/256))= +6.92171e-001 +// +// Table entry 256 is +// 0 fffe b1321ff67cba178c 51da12f4df5a0000 +// +// The shift value is +// 63 - (51 -(0xffff - 0xfffe)) = 13 +// +// The true exponent of the low part is +// true_exponent_of_high - (64 - shift_value) +// -1 - (64-13) = -52 +// Biased as a double, this is 0x3cb +// +// +// +// So then lsb(T) must be >= 2^-51 +// msb(Klog2_hi) <= 2^12 +// +// +--------+---------+ +// | 51 bits | <== largest T +// +--------+---------+ +// | 9 bits | 42 bits | <== smallest T +// +------------+----------------+-+ +// | 13 bits | 50 bits | | +// +------------+----------------+-+ + + + +// Special Cases +//============================================================== + +// double float +// overflow error 24 30 + +// underflow error 25 31 + +// X zero Y zero +// +0 +0 +1 error 26 32 +// -0 +0 +1 error 26 32 +// +0 -0 +1 error 26 32 +// -0 -0 +1 error 26 32 + +// X zero Y negative +// +0 -odd integer +inf error 27 33 divide-by-zero +// -0 -odd integer -inf error 27 33 divide-by-zero +// +0 !-odd integer +inf error 27 33 divide-by-zero +// -0 !-odd integer +inf error 27 33 divide-by-zero +// +0 -inf +inf error 27 33 divide-by-zero +// -0 -inf +inf error 27 33 divide-by-zero + +// X zero Y positve +// +0 +odd integer +0 +// -0 +odd integer -0 +// +0 !+odd integer +0 +// -0 !+odd integer +0 +// +0 +inf +0 +// -0 +inf +0 +// +0 Y NaN quiet Y invalid if Y SNaN 
+// -0 Y NaN quiet Y invalid if Y SNaN + +// X one +// -1 Y inf +1 +// -1 Y NaN quiet Y invalid if Y SNaN +// +1 Y NaN +1 invalid if Y SNaN +// +1 Y any else +1 + +// X - Y not integer QNAN error 28 34 invalid + +// X NaN Y 0 +1 error 29 35 +// X NaN Y NaN quiet X invalid if X or Y SNaN +// X NaN Y any else quiet X invalid if X SNaN +// X !+1 Y NaN quiet Y invalid if Y SNaN + + +// X +inf Y >0 +inf +// X -inf Y >0, !odd integer +inf +// X -inf Y >0, odd integer -inf + +// X +inf Y <0 +0 +// X -inf Y <0, !odd integer +0 +// X -inf Y <0, odd integer -0 + +// X +inf Y =0 +1 +// X -inf Y =0 +1 + +// |X|<1 Y +inf +0 +// |X|<1 Y -inf +inf +// |X|>1 Y +inf +inf +// |X|>1 Y -inf +0 + +// X any Y =0 +1 + +#include "libm_support.h" + +// Assembly macros +//============================================================== + +// integer registers used + +pow_AD_Tt = r33 +pow_GR_FFF7 = r34 +pow_GR_exp_Y = r34 // duplicate +pow_GR_17ones = r35 + +pow_AD_P = r36 +pow_AD_Q = r37 +pow_AD_tbl1 = r38 +pow_AD_tbl2 = r39 +pow_GR_exp_X = r40 +pow_GR_true_exp_X = r40 // duplicate + +pow_GR_offset = r41 +pow_GR_exp_Xm1 = r42 +pow_GR_sig_X = r43 +pow_GR_signexp_X = r44 + +pow_GR_signexp_Xm1 = r46 +pow_GR_int_W1 = r47 +pow_GR_int_W2 = r48 +pow_GR_int_N = r49 +pow_GR_index1 = r50 + +pow_GR_index2 = r51 +pow_AD_T1 = r52 +pow_AD_T2 = r53 +pow_GR_gt_ln = r53 // duplicate +pow_int_GR_M = r54 +pow_GR_10033 = r55 + +pow_GR_16ones = r56 +pow_GR_sig_int_Y = r57 +pow_GR_sign_Y_Gpr = r58 +pow_GR_17ones_m1 = r59 +pow_GR_one = r60 +pow_GR_sign_Y = r60 + +pow_GR_signexp_Y_Gpr = r61 +pow_GR_exp_Y_Gpr = r62 +pow_GR_true_exp_Y_Gpr = r63 +pow_GR_signexp_Y = r64 + +GR_SAVE_B0 = r65 +GR_SAVE_GP = r66 +GR_SAVE_PFS = r67 + +GR_Parameter_X = r68 +GR_Parameter_Y = r69 +GR_Parameter_RESULT = r70 +pow_GR_tag = r71 + + +// floating point registers used + +POW_B = f32 +POW_NORM_X = f33 +POW_Xm1 = f34 +POW_r1 = f34 +POW_P4 = f35 + +POW_P5 = f36 +POW_NORM_Y = f37 +POW_Q2 = f38 +POW_Q3 = f39 +POW_P2 = f40 + +POW_P3 = f41 
+POW_P0 = f42 +POW_log2_lo = f43 +POW_r = f44 +POW_Q0_half = f45 + +POW_Q1 = f46 +POW_log2_hi = f48 +POW_Q4 = f49 +POW_P1 = f50 + +POW_log2_by_128_hi = f51 +POW_inv_log2_by_128 = f52 +POW_rsq = f53 +POW_Yrcub = f54 +POW_log2_by_128_lo = f55 + +POW_v6 = f56 +POW_v4 = f58 +POW_v2 = f59 +POW_T = f60 + +POW_Tt = f61 +POW_RSHF = f62 +POW_v21ps = f63 +POW_s4 = f64 + +POW_U = f66 +POW_G = f67 +POW_delta = f68 +POW_v3 = f69 +POW_V = f70 + +POW_p = f71 +POW_Z1 = f72 +POW_e3 = f73 +POW_e2 = f74 +POW_Z2 = f75 + +POW_e1 = f76 +POW_W1 = f77 +POW_UmZ2 = f78 +POW_W2 = f79 +POW_Z3 = f80 + +POW_int_W1 = f81 +POW_e12 = f82 +POW_int_W2 = f83 +POW_UmZ2pV = f84 +POW_Z3sq = f85 + +POW_e123 = f86 +POW_N1float = f87 +POW_N2float = f88 +POW_f3 = f89 +POW_q = f90 + +POW_s1 = f91 +POW_Nfloat = f92 +POW_s2 = f93 +POW_f2 = f94 +POW_f1 = f95 + +POW_T1 = f96 +POW_T2 = f97 +POW_2M = f98 +POW_s = f99 +POW_f12 = f100 + +POW_ssq = f101 +POW_T1T2 = f102 +POW_1ps = f103 +POW_A = f104 +POW_es = f105 + +POW_int_K = f107 +POW_K = f108 +POW_f123 = f109 +POW_Gpr = f110 + +POW_Y_Gpr = f111 +POW_int_Y = f112 + +POW_float_int_Y = f116 +POW_ftz_urm_f8 = f117 +POW_wre_urm_f8 = f118 +POW_abs_A = f119 +POW_gt_pln = f120 + +POW_xsq = f121 + +POW_twoV = f122 +POW_Xp1 = f123 + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +pow_table_P: +ASM_TYPE_DIRECTIVE(pow_table_P,@object) +data8 0x8000F7B249FF332D, 0x0000BFFC // P_5 +data8 0xAAAAAAA9E7902C7F, 0x0000BFFC // P_3 +data8 0x80000000000018E5, 0x0000BFFD // P_1 +data8 0xb8aa3b295c17f0bc, 0x00004006 // inv_ln2_by_128 + + +data8 0x3FA5555555554A9E // Q_2 +data8 0x3F8111124F4DD9F9 // Q_3 +data8 0x3FE0000000000000 // Q_0 +data8 0x3FC5555555554733 // Q_1 +data8 0x3F56C16D9360FFA0 // Q_4 +data8 0x43e8000000000000 // Right shift constant for exp +data8 0xc9e3b39803f2f6af, 0x00003fb7 // ln2_by_128_lo +data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q 
+data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q +ASM_SIZE_DIRECTIVE(pow_table_P) + +pow_table_Q: +ASM_TYPE_DIRECTIVE(pow_table_Q,@object) +data8 0x9249FE7F0DC423CF, 0x00003FFC // P_4 +data8 0xCCCCCCCC4ED2BA7F, 0x00003FFC // P_2 +data8 0xAAAAAAAAAAAAB505, 0x00003FFD // P_0 +data8 0x3fe62e42fefa39e8, 0x3cccd5e4f1d9cc02 // log2 hi lo = +6.93147e-001 +data8 0xb17217f7d1cf79ab, 0x00003ff7 // ln2_by_128_hi +ASM_SIZE_DIRECTIVE(pow_table_Q) + + +pow_Tt: +ASM_TYPE_DIRECTIVE(pow_Tt,@object) +data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 // log(1/frcpa(1+0/256))= +1.95503e-003 +data8 0x3f78121214586a00, 0x3cb540e0a5cfc9bc // log(1/frcpa(1+1/256))= +5.87661e-003 +data8 0x3f841929f9683200, 0x3cbdf1d57404da1f // log(1/frcpa(1+2/256))= +9.81362e-003 +data8 0x3f8c317384c75f00, 0x3c69806208c04c22 // log(1/frcpa(1+3/256))= +1.37662e-002 +data8 0x3f91a6b91ac73380, 0x3c7874daa716eb32 // log(1/frcpa(1+4/256))= +1.72376e-002 +data8 0x3f95ba9a5d9ac000, 0x3cacbb84e08d78ac // log(1/frcpa(1+5/256))= +2.12196e-002 +data8 0x3f99d2a807432580, 0x3cbcf80538b441e1 // log(1/frcpa(1+6/256))= +2.52177e-002 +data8 0x3f9d6b2725979800, 0x3c6095e5c8f8f359 // log(1/frcpa(1+7/256))= +2.87291e-002 +data8 0x3fa0c58fa19dfa80, 0x3cb4c5d4e9d0dda2 // log(1/frcpa(1+8/256))= +3.27573e-002 +data8 0x3fa2954c78cbce00, 0x3caa932b860ab8d6 // log(1/frcpa(1+9/256))= +3.62953e-002 +data8 0x3fa4a94d2da96c40, 0x3ca670452b76bbd5 // log(1/frcpa(1+10/256))= +4.03542e-002 +data8 0x3fa67c94f2d4bb40, 0x3ca84104f9941798 // log(1/frcpa(1+11/256))= +4.39192e-002 +data8 0x3fa85188b630f040, 0x3cb40a882cbf0153 // log(1/frcpa(1+12/256))= +4.74971e-002 +data8 0x3faa6b8abe73af40, 0x3c988d46e25c9059 // log(1/frcpa(1+13/256))= +5.16017e-002 +data8 0x3fac441e06f72a80, 0x3cae3e930a1a2a96 // log(1/frcpa(1+14/256))= +5.52072e-002 +data8 0x3fae1e6713606d00, 0x3c8a796f6283b580 // log(1/frcpa(1+15/256))= +5.88257e-002 +data8 0x3faffa6911ab9300, 0x3c5193070351e88a // log(1/frcpa(1+16/256))= +6.24574e-002 +data8 
0x3fb0ec139c5da600, 0x3c623f2a75eb992d // log(1/frcpa(1+17/256))= +6.61022e-002 +data8 0x3fb1dbd2643d1900, 0x3ca649b2ef8927f0 // log(1/frcpa(1+18/256))= +6.97605e-002 +data8 0x3fb2cc7284fe5f00, 0x3cbc5e86599513e2 // log(1/frcpa(1+19/256))= +7.34321e-002 +data8 0x3fb3bdf5a7d1ee60, 0x3c90bd4bb69dada3 // log(1/frcpa(1+20/256))= +7.71173e-002 +data8 0x3fb4b05d7aa012e0, 0x3c54e377c9b8a54f // log(1/frcpa(1+21/256))= +8.08161e-002 +data8 0x3fb580db7ceb5700, 0x3c7fdb2f98354cde // log(1/frcpa(1+22/256))= +8.39975e-002 +data8 0x3fb674f089365a60, 0x3cb9994c9d3301c1 // log(1/frcpa(1+23/256))= +8.77219e-002 +data8 0x3fb769ef2c6b5680, 0x3caaec639db52a79 // log(1/frcpa(1+24/256))= +9.14602e-002 +data8 0x3fb85fd927506a40, 0x3c9f9f99a3cf8e25 // log(1/frcpa(1+25/256))= +9.52125e-002 +data8 0x3fb9335e5d594980, 0x3ca15c3abd47d99a // log(1/frcpa(1+26/256))= +9.84401e-002 +data8 0x3fba2b0220c8e5e0, 0x3cb4ca639adf6fc3 // log(1/frcpa(1+27/256))= +1.02219e-001 +data8 0x3fbb0004ac1a86a0, 0x3ca7cb81bf959a59 // log(1/frcpa(1+28/256))= +1.05469e-001 +data8 0x3fbbf968769fca00, 0x3cb0c646c121418e // log(1/frcpa(1+29/256))= +1.09274e-001 +data8 0x3fbccfedbfee13a0, 0x3ca0465fce24ab4b // log(1/frcpa(1+30/256))= +1.12548e-001 +data8 0x3fbda727638446a0, 0x3c82803f4e2e6603 // log(1/frcpa(1+31/256))= +1.15832e-001 +data8 0x3fbea3257fe10f60, 0x3cb986a3f2313d1a // log(1/frcpa(1+32/256))= +1.19677e-001 +data8 0x3fbf7be9fedbfde0, 0x3c97d16a6a621cf4 // log(1/frcpa(1+33/256))= +1.22985e-001 +data8 0x3fc02ab352ff25f0, 0x3c9cc6baad365600 // log(1/frcpa(1+34/256))= +1.26303e-001 +data8 0x3fc097ce579d2040, 0x3cb9ba16d329440b // log(1/frcpa(1+35/256))= +1.29633e-001 +data8 0x3fc1178e8227e470, 0x3cb7bc671683f8e6 // log(1/frcpa(1+36/256))= +1.33531e-001 +data8 0x3fc185747dbecf30, 0x3c9d1116f66d2345 // log(1/frcpa(1+37/256))= +1.36885e-001 +data8 0x3fc1f3b925f25d40, 0x3c8162c9ef939ac6 // log(1/frcpa(1+38/256))= +1.40250e-001 +data8 0x3fc2625d1e6ddf50, 0x3caad3a1ec384fc3 // log(1/frcpa(1+39/256))= +1.43627e-001 
+data8 0x3fc2d1610c868130, 0x3cb3ad997036941b // log(1/frcpa(1+40/256))= +1.47015e-001 +data8 0x3fc340c597411420, 0x3cbc2308262c7998 // log(1/frcpa(1+41/256))= +1.50414e-001 +data8 0x3fc3b08b6757f2a0, 0x3cb2170d6cdf0526 // log(1/frcpa(1+42/256))= +1.53825e-001 +data8 0x3fc40dfb08378000, 0x3c9bb453c4f7b685 // log(1/frcpa(1+43/256))= +1.56677e-001 +data8 0x3fc47e74e8ca5f70, 0x3cb836a48fdfce9d // log(1/frcpa(1+44/256))= +1.60109e-001 +data8 0x3fc4ef51f6466de0, 0x3ca07a43919aa64b // log(1/frcpa(1+45/256))= +1.63553e-001 +data8 0x3fc56092e02ba510, 0x3ca85006899d97b0 // log(1/frcpa(1+46/256))= +1.67010e-001 +data8 0x3fc5d23857cd74d0, 0x3ca30a5ba6e7abbe // log(1/frcpa(1+47/256))= +1.70478e-001 +data8 0x3fc6313a37335d70, 0x3ca905586f0ac97e // log(1/frcpa(1+48/256))= +1.73377e-001 +data8 0x3fc6a399dabbd380, 0x3c9b2c6657a96684 // log(1/frcpa(1+49/256))= +1.76868e-001 +data8 0x3fc70337dd3ce410, 0x3cb50bc52f55cdd8 // log(1/frcpa(1+50/256))= +1.79786e-001 +data8 0x3fc77654128f6120, 0x3cad2eb7c9a39efe // log(1/frcpa(1+51/256))= +1.83299e-001 +data8 0x3fc7e9d82a0b0220, 0x3cba127e90393c01 // log(1/frcpa(1+52/256))= +1.86824e-001 +data8 0x3fc84a6b759f5120, 0x3cbd7fd52079f706 // log(1/frcpa(1+53/256))= +1.89771e-001 +data8 0x3fc8ab47d5f5a300, 0x3cbfae141751a3de // log(1/frcpa(1+54/256))= +1.92727e-001 +data8 0x3fc91fe490965810, 0x3cb69cf30a1c319e // log(1/frcpa(1+55/256))= +1.96286e-001 +data8 0x3fc981634011aa70, 0x3ca5bb3d208bc42a // log(1/frcpa(1+56/256))= +1.99261e-001 +data8 0x3fc9f6c407089660, 0x3ca04d68658179a0 // log(1/frcpa(1+57/256))= +2.02843e-001 +data8 0x3fca58e729348f40, 0x3c99f5411546c286 // log(1/frcpa(1+58/256))= +2.05838e-001 +data8 0x3fcabb55c31693a0, 0x3cb9a5350eb327d5 // log(1/frcpa(1+59/256))= +2.08842e-001 +data8 0x3fcb1e104919efd0, 0x3c18965fcce7c406 // log(1/frcpa(1+60/256))= +2.11855e-001 +data8 0x3fcb94ee93e367c0, 0x3cb503716da45184 // log(1/frcpa(1+61/256))= +2.15483e-001 +data8 0x3fcbf851c0675550, 0x3cbdf1b3f7ab5378 // log(1/frcpa(1+62/256))= 
+2.18516e-001 +data8 0x3fcc5c0254bf23a0, 0x3ca7aab9ed0b1d7b // log(1/frcpa(1+63/256))= +2.21558e-001 +data8 0x3fccc000c9db3c50, 0x3c92a7a2a850072a // log(1/frcpa(1+64/256))= +2.24609e-001 +data8 0x3fcd244d99c85670, 0x3c9f6019120edf4c // log(1/frcpa(1+65/256))= +2.27670e-001 +data8 0x3fcd88e93fb2f450, 0x3c6affb96815e081 // log(1/frcpa(1+66/256))= +2.30741e-001 +data8 0x3fcdedd437eaef00, 0x3c72553595897976 // log(1/frcpa(1+67/256))= +2.33820e-001 +data8 0x3fce530effe71010, 0x3c90913b020fa182 // log(1/frcpa(1+68/256))= +2.36910e-001 +data8 0x3fceb89a1648b970, 0x3c837ba4045bfd25 // log(1/frcpa(1+69/256))= +2.40009e-001 +data8 0x3fcf1e75fadf9bd0, 0x3cbcea6d13e0498d // log(1/frcpa(1+70/256))= +2.43117e-001 +data8 0x3fcf84a32ead7c30, 0x3ca5e3a67b3c6d77 // log(1/frcpa(1+71/256))= +2.46235e-001 +data8 0x3fcfeb2233ea07c0, 0x3cba0c6f0049c5a6 // log(1/frcpa(1+72/256))= +2.49363e-001 +data8 0x3fd028f9c7035c18, 0x3cb0a30b06677ff6 // log(1/frcpa(1+73/256))= +2.52501e-001 +data8 0x3fd05c8be0d96358, 0x3ca0f1c77ccb5865 // log(1/frcpa(1+74/256))= +2.55649e-001 +data8 0x3fd085eb8f8ae790, 0x3cbd513f45fe7a97 // log(1/frcpa(1+75/256))= +2.58174e-001 +data8 0x3fd0b9c8e32d1910, 0x3c927449047ca006 // log(1/frcpa(1+76/256))= +2.61339e-001 +data8 0x3fd0edd060b78080, 0x3c89b52d8435f53e // log(1/frcpa(1+77/256))= +2.64515e-001 +data8 0x3fd122024cf00638, 0x3cbdd976fabda4bd // log(1/frcpa(1+78/256))= +2.67701e-001 +data8 0x3fd14be2927aecd0, 0x3cb02f90ad0bc471 // log(1/frcpa(1+79/256))= +2.70257e-001 +data8 0x3fd180618ef18ad8, 0x3cbd003792c71a98 // log(1/frcpa(1+80/256))= +2.73461e-001 +data8 0x3fd1b50bbe2fc638, 0x3ca9ae64c6403ead // log(1/frcpa(1+81/256))= +2.76675e-001 +data8 0x3fd1df4cc7cf2428, 0x3cb43f0455f7e395 // log(1/frcpa(1+82/256))= +2.79254e-001 +data8 0x3fd214456d0eb8d0, 0x3cb0fbd748d75d30 // log(1/frcpa(1+83/256))= +2.82487e-001 +data8 0x3fd23ec5991eba48, 0x3c906edd746b77e2 // log(1/frcpa(1+84/256))= +2.85081e-001 +data8 0x3fd2740d9f870af8, 0x3ca9802e6a00a670 // 
log(1/frcpa(1+85/256))= +2.88333e-001 +data8 0x3fd29ecdabcdfa00, 0x3cacecef70890cfa // log(1/frcpa(1+86/256))= +2.90943e-001 +data8 0x3fd2d46602adcce8, 0x3cb97911955f3521 // log(1/frcpa(1+87/256))= +2.94214e-001 +data8 0x3fd2ff66b04ea9d0, 0x3cb12dabe191d1c9 // log(1/frcpa(1+88/256))= +2.96838e-001 +data8 0x3fd335504b355a30, 0x3cbdf9139df924ec // log(1/frcpa(1+89/256))= +3.00129e-001 +data8 0x3fd360925ec44f58, 0x3cb253e68977a1e3 // log(1/frcpa(1+90/256))= +3.02769e-001 +data8 0x3fd38bf1c3337e70, 0x3cb3d283d2a2da21 // log(1/frcpa(1+91/256))= +3.05417e-001 +data8 0x3fd3c25277333180, 0x3cadaa5b035eae27 // log(1/frcpa(1+92/256))= +3.08735e-001 +data8 0x3fd3edf463c16838, 0x3cb983d680d3c108 // log(1/frcpa(1+93/256))= +3.11399e-001 +data8 0x3fd419b423d5e8c0, 0x3cbc86dd921c139d // log(1/frcpa(1+94/256))= +3.14069e-001 +data8 0x3fd44591e0539f48, 0x3c86a76d6dc2782e // log(1/frcpa(1+95/256))= +3.16746e-001 +data8 0x3fd47c9175b6f0a8, 0x3cb59a2e013c6b5f // log(1/frcpa(1+96/256))= +3.20103e-001 +data8 0x3fd4a8b341552b08, 0x3c93f1e86e468694 // log(1/frcpa(1+97/256))= +3.22797e-001 +data8 0x3fd4d4f390890198, 0x3cbf5e4ea7c5105a // log(1/frcpa(1+98/256))= +3.25498e-001 +data8 0x3fd501528da1f960, 0x3cbf58da53e9ad10 // log(1/frcpa(1+99/256))= +3.28206e-001 +data8 0x3fd52dd06347d4f0, 0x3cb98a28cebf6eef // log(1/frcpa(1+100/256))= +3.30921e-001 +data8 0x3fd55a6d3c7b8a88, 0x3c9c76b67c2d1fd4 // log(1/frcpa(1+101/256))= +3.33644e-001 +data8 0x3fd5925d2b112a58, 0x3c9029616a4331b8 // log(1/frcpa(1+102/256))= +3.37058e-001 +data8 0x3fd5bf406b543db0, 0x3c9fb8292ecfc820 // log(1/frcpa(1+103/256))= +3.39798e-001 +data8 0x3fd5ec433d5c35a8, 0x3cb71a1229d17eec // log(1/frcpa(1+104/256))= +3.42545e-001 +data8 0x3fd61965cdb02c18, 0x3cbba94fe1dbb8d2 // log(1/frcpa(1+105/256))= +3.45300e-001 +data8 0x3fd646a84935b2a0, 0x3c9ee496d2c9ae57 // log(1/frcpa(1+106/256))= +3.48063e-001 +data8 0x3fd6740add31de90, 0x3cb1da3a6c7a9dfd // log(1/frcpa(1+107/256))= +3.50833e-001 +data8 0x3fd6a18db74a58c0, 
0x3cb494c257add8dc // log(1/frcpa(1+108/256))= +3.53610e-001 +data8 0x3fd6cf31058670e8, 0x3cb0b244a70a8da9 // log(1/frcpa(1+109/256))= +3.56396e-001 +data8 0x3fd6f180e852f0b8, 0x3c9db7aefa866720 // log(1/frcpa(1+110/256))= +3.58490e-001 +data8 0x3fd71f5d71b894e8, 0x3cbe91c4bf324957 // log(1/frcpa(1+111/256))= +3.61289e-001 +data8 0x3fd74d5aefd66d58, 0x3cb06b3d9bfac023 // log(1/frcpa(1+112/256))= +3.64096e-001 +data8 0x3fd77b79922bd378, 0x3cb727d8804491f4 // log(1/frcpa(1+113/256))= +3.66911e-001 +data8 0x3fd7a9b9889f19e0, 0x3ca2ef22df5bc543 // log(1/frcpa(1+114/256))= +3.69734e-001 +data8 0x3fd7d81b037eb6a0, 0x3cb8fd3ba07a7ece // log(1/frcpa(1+115/256))= +3.72565e-001 +data8 0x3fd8069e33827230, 0x3c8bd1e25866e61a // log(1/frcpa(1+116/256))= +3.75404e-001 +data8 0x3fd82996d3ef8bc8, 0x3ca5aab9f5928928 // log(1/frcpa(1+117/256))= +3.77538e-001 +data8 0x3fd85855776dcbf8, 0x3ca56f33337789d6 // log(1/frcpa(1+118/256))= +3.80391e-001 +data8 0x3fd8873658327cc8, 0x3cbb8ef0401db49d // log(1/frcpa(1+119/256))= +3.83253e-001 +data8 0x3fd8aa75973ab8c8, 0x3cbb9961f509a680 // log(1/frcpa(1+120/256))= +3.85404e-001 +data8 0x3fd8d992dc8824e0, 0x3cb220512a53732d // log(1/frcpa(1+121/256))= +3.88280e-001 +data8 0x3fd908d2ea7d9510, 0x3c985f0e513bfb5c // log(1/frcpa(1+122/256))= +3.91164e-001 +data8 0x3fd92c59e79c0e50, 0x3cb82e073fd30d63 // log(1/frcpa(1+123/256))= +3.93332e-001 +data8 0x3fd95bd750ee3ed0, 0x3ca4aa7cdb6dd8a8 // log(1/frcpa(1+124/256))= +3.96231e-001 +data8 0x3fd98b7811a3ee58, 0x3caa93a5b660893e // log(1/frcpa(1+125/256))= +3.99138e-001 +data8 0x3fd9af47f33d4068, 0x3cac294b3b3190ba // log(1/frcpa(1+126/256))= +4.01323e-001 +data8 0x3fd9df270c1914a0, 0x3cbe1a58fd0cd67e // log(1/frcpa(1+127/256))= +4.04245e-001 +data8 0x3fda0325ed14fda0, 0x3cb1efa7950fb57e // log(1/frcpa(1+128/256))= +4.06442e-001 +data8 0x3fda33440224fa78, 0x3c8915fe75e7d477 // log(1/frcpa(1+129/256))= +4.09379e-001 +data8 0x3fda57725e80c380, 0x3ca72bd1062b1b7f // log(1/frcpa(1+130/256))= +4.11587e-001 
+data8 0x3fda87d0165dd198, 0x3c91f7845f58dbad // log(1/frcpa(1+131/256))= +4.14539e-001 +data8 0x3fdaac2e6c03f890, 0x3cb6f237a911c509 // log(1/frcpa(1+132/256))= +4.16759e-001 +data8 0x3fdadccc6fdf6a80, 0x3c90ddc4b7687169 // log(1/frcpa(1+133/256))= +4.19726e-001 +data8 0x3fdb015b3eb1e790, 0x3c692dd7d90e1e8e // log(1/frcpa(1+134/256))= +4.21958e-001 +data8 0x3fdb323a3a635948, 0x3c6f85655cbe14de // log(1/frcpa(1+135/256))= +4.24941e-001 +data8 0x3fdb56fa04462908, 0x3c95252d841994de // log(1/frcpa(1+136/256))= +4.27184e-001 +data8 0x3fdb881aa659bc90, 0x3caa53a745a3642f // log(1/frcpa(1+137/256))= +4.30182e-001 +data8 0x3fdbad0bef3db160, 0x3cb32f2540dcc16a // log(1/frcpa(1+138/256))= +4.32437e-001 +data8 0x3fdbd21297781c28, 0x3cbd8e891e106f1d // log(1/frcpa(1+139/256))= +4.34697e-001 +data8 0x3fdc039236f08818, 0x3c809435af522ba7 // log(1/frcpa(1+140/256))= +4.37718e-001 +data8 0x3fdc28cb1e4d32f8, 0x3cb3944752fbd81e // log(1/frcpa(1+141/256))= +4.39990e-001 +data8 0x3fdc4e19b84723c0, 0x3c9a465260cd3fe5 // log(1/frcpa(1+142/256))= +4.42267e-001 +data8 0x3fdc7ff9c74554c8, 0x3c92447d5b6ca369 // log(1/frcpa(1+143/256))= +4.45311e-001 +data8 0x3fdca57b64e9db00, 0x3cb44344a8a00c82 // log(1/frcpa(1+144/256))= +4.47600e-001 +data8 0x3fdccb130a5ceba8, 0x3cbefaddfb97b73f // log(1/frcpa(1+145/256))= +4.49895e-001 +data8 0x3fdcf0c0d18f3268, 0x3cbd3e7bfee57898 // log(1/frcpa(1+146/256))= +4.52194e-001 +data8 0x3fdd232075b5a200, 0x3c9222599987447c // log(1/frcpa(1+147/256))= +4.55269e-001 +data8 0x3fdd490246defa68, 0x3cabafe9a767a80d // log(1/frcpa(1+148/256))= +4.57581e-001 +data8 0x3fdd6efa918d25c8, 0x3cb58a2624e1c6fd // log(1/frcpa(1+149/256))= +4.59899e-001 +data8 0x3fdd9509707ae528, 0x3cbdc3babce578e7 // log(1/frcpa(1+150/256))= +4.62221e-001 +data8 0x3fddbb2efe92c550, 0x3cb0ac0943c434a4 // log(1/frcpa(1+151/256))= +4.64550e-001 +data8 0x3fddee2f3445e4a8, 0x3cbba9d07ce820e8 // log(1/frcpa(1+152/256))= +4.67663e-001 +data8 0x3fde148a1a2726c8, 0x3cb6537e3375b205 // 
log(1/frcpa(1+153/256))= +4.70004e-001 +data8 0x3fde3afc0a49ff38, 0x3cbfed5518dbc20e // log(1/frcpa(1+154/256))= +4.72350e-001 +data8 0x3fde6185206d5168, 0x3cb6572601f73d5c // log(1/frcpa(1+155/256))= +4.74702e-001 +data8 0x3fde882578823d50, 0x3c9b24abd4584d1a // log(1/frcpa(1+156/256))= +4.77060e-001 +data8 0x3fdeaedd2eac9908, 0x3cb0ceb5e4d2c8f7 // log(1/frcpa(1+157/256))= +4.79423e-001 +data8 0x3fded5ac5f436be0, 0x3ca72f21f1f5238e // log(1/frcpa(1+158/256))= +4.81792e-001 +data8 0x3fdefc9326d16ab8, 0x3c85081a1639a45c // log(1/frcpa(1+159/256))= +4.84166e-001 +data8 0x3fdf2391a21575f8, 0x3cbf11015bdd297a // log(1/frcpa(1+160/256))= +4.86546e-001 +data8 0x3fdf4aa7ee031928, 0x3cb3795bc052a2d1 // log(1/frcpa(1+161/256))= +4.88932e-001 +data8 0x3fdf71d627c30bb0, 0x3c35c61f0f5a88f3 // log(1/frcpa(1+162/256))= +4.91323e-001 +data8 0x3fdf991c6cb3b378, 0x3c97d99419be6028 // log(1/frcpa(1+163/256))= +4.93720e-001 +data8 0x3fdfc07ada69a908, 0x3cbfe9341ded70b1 // log(1/frcpa(1+164/256))= +4.96123e-001 +data8 0x3fdfe7f18eb03d38, 0x3cb85718a640c33f // log(1/frcpa(1+165/256))= +4.98532e-001 +data8 0x3fe007c053c5002c, 0x3cb3addc9c065f09 // log(1/frcpa(1+166/256))= +5.00946e-001 +data8 0x3fe01b942198a5a0, 0x3c9d5aa4c77da6ac // log(1/frcpa(1+167/256))= +5.03367e-001 +data8 0x3fe02f74400c64e8, 0x3cb5a0ee4450ef52 // log(1/frcpa(1+168/256))= +5.05793e-001 +data8 0x3fe04360be7603ac, 0x3c9dd00c35630fe0 // log(1/frcpa(1+169/256))= +5.08225e-001 +data8 0x3fe05759ac47fe30, 0x3cbd063e1f0bd82c // log(1/frcpa(1+170/256))= +5.10663e-001 +data8 0x3fe06b5f1911cf50, 0x3cae8da674af5289 // log(1/frcpa(1+171/256))= +5.13107e-001 +data8 0x3fe078bf0533c568, 0x3c62241edf5fd1f7 // log(1/frcpa(1+172/256))= +5.14740e-001 +data8 0x3fe08cd9687e7b0c, 0x3cb3007febcca227 // log(1/frcpa(1+173/256))= +5.17194e-001 +data8 0x3fe0a10074cf9018, 0x3ca496e84603816b // log(1/frcpa(1+174/256))= +5.19654e-001 +data8 0x3fe0b5343a234474, 0x3cb46098d14fc90a // log(1/frcpa(1+175/256))= +5.22120e-001 +data8 
0x3fe0c974c89431cc, 0x3cac0a7cdcbb86c6 // log(1/frcpa(1+176/256))= +5.24592e-001 +data8 0x3fe0ddc2305b9884, 0x3cb2f753210410ff // log(1/frcpa(1+177/256))= +5.27070e-001 +data8 0x3fe0eb524bafc918, 0x3c88affd6682229e // log(1/frcpa(1+178/256))= +5.28726e-001 +data8 0x3fe0ffb54213a474, 0x3cadeefbab9af993 // log(1/frcpa(1+179/256))= +5.31214e-001 +data8 0x3fe114253da97d9c, 0x3cbaf1c2b8bc160a // log(1/frcpa(1+180/256))= +5.33709e-001 +data8 0x3fe128a24f1d9afc, 0x3cb9cf4df375e650 // log(1/frcpa(1+181/256))= +5.36210e-001 +data8 0x3fe1365252bf0864, 0x3c985a621d4be111 // log(1/frcpa(1+182/256))= +5.37881e-001 +data8 0x3fe14ae558b4a92c, 0x3ca104c4aa8977d1 // log(1/frcpa(1+183/256))= +5.40393e-001 +data8 0x3fe15f85a19c7658, 0x3cbadf26e540f375 // log(1/frcpa(1+184/256))= +5.42910e-001 +data8 0x3fe16d4d38c119f8, 0x3cb3aea11caec416 // log(1/frcpa(1+185/256))= +5.44592e-001 +data8 0x3fe18203c20dd130, 0x3cba82d1211d1d6d // log(1/frcpa(1+186/256))= +5.47121e-001 +data8 0x3fe196c7bc4b1f38, 0x3cb6267acc4f4f4a // log(1/frcpa(1+187/256))= +5.49656e-001 +data8 0x3fe1a4a738b7a33c, 0x3c858930213c987d // log(1/frcpa(1+188/256))= +5.51349e-001 +data8 0x3fe1b981c0c9653c, 0x3c9bc2a4a30f697b // log(1/frcpa(1+189/256))= +5.53895e-001 +data8 0x3fe1ce69e8bb1068, 0x3cb7ae6199cf2a00 // log(1/frcpa(1+190/256))= +5.56447e-001 +data8 0x3fe1dc619de06944, 0x3c6b50bb38388177 // log(1/frcpa(1+191/256))= +5.58152e-001 +data8 0x3fe1f160a2ad0da0, 0x3cbd05b2778a5e1d // log(1/frcpa(1+192/256))= +5.60715e-001 +data8 0x3fe2066d7740737c, 0x3cb32e828f9c6bd6 // log(1/frcpa(1+193/256))= +5.63285e-001 +data8 0x3fe2147dba47a390, 0x3cbd579851b8b672 // log(1/frcpa(1+194/256))= +5.65001e-001 +data8 0x3fe229a1bc5ebac0, 0x3cbb321be5237ce8 // log(1/frcpa(1+195/256))= +5.67582e-001 +data8 0x3fe237c1841a502c, 0x3cb3b56e0915ea64 // log(1/frcpa(1+196/256))= +5.69306e-001 +data8 0x3fe24cfce6f80d98, 0x3cb34a4d1a422919 // log(1/frcpa(1+197/256))= +5.71898e-001 +data8 0x3fe25b2c55cd5760, 0x3cb237401ea5015e // 
log(1/frcpa(1+198/256))= +5.73630e-001 +data8 0x3fe2707f4d5f7c40, 0x3c9d30f20acc8341 // log(1/frcpa(1+199/256))= +5.76233e-001 +data8 0x3fe285e0842ca380, 0x3cbc4d866d5f21c0 // log(1/frcpa(1+200/256))= +5.78842e-001 +data8 0x3fe294294708b770, 0x3cb85e14d5dc54fa // log(1/frcpa(1+201/256))= +5.80586e-001 +data8 0x3fe2a9a2670aff0c, 0x3c7e6f8f468bbf91 // log(1/frcpa(1+202/256))= +5.83207e-001 +data8 0x3fe2b7fb2c8d1cc0, 0x3c930ffcf63c8b65 // log(1/frcpa(1+203/256))= +5.84959e-001 +data8 0x3fe2c65a6395f5f4, 0x3ca0afe20b53d2d2 // log(1/frcpa(1+204/256))= +5.86713e-001 +data8 0x3fe2dbf557b0df40, 0x3cb646be1188fbc9 // log(1/frcpa(1+205/256))= +5.89350e-001 +data8 0x3fe2ea64c3f97654, 0x3c96516fa8df33b2 // log(1/frcpa(1+206/256))= +5.91113e-001 +data8 0x3fe3001823684d70, 0x3cb96d64e16d1360 // log(1/frcpa(1+207/256))= +5.93762e-001 +data8 0x3fe30e97e9a8b5cc, 0x3c98ef96bc97cca0 // log(1/frcpa(1+208/256))= +5.95531e-001 +data8 0x3fe32463ebdd34e8, 0x3caef1dc9a56c1bf // log(1/frcpa(1+209/256))= +5.98192e-001 +data8 0x3fe332f4314ad794, 0x3caa4f0ac5d5fa11 // log(1/frcpa(1+210/256))= +5.99970e-001 +data8 0x3fe348d90e7464cc, 0x3cbe7889f0516acd // log(1/frcpa(1+211/256))= +6.02643e-001 +data8 0x3fe35779f8c43d6c, 0x3ca96bbab7245411 // log(1/frcpa(1+212/256))= +6.04428e-001 +data8 0x3fe36621961a6a98, 0x3ca31f32262db9fb // log(1/frcpa(1+213/256))= +6.06217e-001 +data8 0x3fe37c299f3c3668, 0x3cb15c72c107ee29 // log(1/frcpa(1+214/256))= +6.08907e-001 +data8 0x3fe38ae2171976e4, 0x3cba42a2554b2dd4 // log(1/frcpa(1+215/256))= +6.10704e-001 +data8 0x3fe399a157a603e4, 0x3cb99c62286d8919 // log(1/frcpa(1+216/256))= +6.12504e-001 +data8 0x3fe3afccfe77b9d0, 0x3ca11048f96a43bd // log(1/frcpa(1+217/256))= +6.15210e-001 +data8 0x3fe3be9d503533b4, 0x3ca4022f47588c3e // log(1/frcpa(1+218/256))= +6.17018e-001 +data8 0x3fe3cd7480b4a8a0, 0x3cb4ba7afc2dc56a // log(1/frcpa(1+219/256))= +6.18830e-001 +data8 0x3fe3e3c43918f76c, 0x3c859673d064b8ba // log(1/frcpa(1+220/256))= +6.21554e-001 +data8 
0x3fe3f2acb27ed6c4, 0x3cb55c6b452a16a8 // log(1/frcpa(1+221/256))= +6.23373e-001 +data8 0x3fe4019c2125ca90, 0x3cb8c367879c5a31 // log(1/frcpa(1+222/256))= +6.25197e-001 +data8 0x3fe4181061389720, 0x3cb2c17a79c5cc6c // log(1/frcpa(1+223/256))= +6.27937e-001 +data8 0x3fe42711518df544, 0x3ca5f38d47012fc5 // log(1/frcpa(1+224/256))= +6.29769e-001 +data8 0x3fe436194e12b6bc, 0x3cb9854d65a9b426 // log(1/frcpa(1+225/256))= +6.31604e-001 +data8 0x3fe445285d68ea68, 0x3ca3ff9b3a81cd81 // log(1/frcpa(1+226/256))= +6.33442e-001 +data8 0x3fe45bcc464c8938, 0x3cb0a2d8011a6c05 // log(1/frcpa(1+227/256))= +6.36206e-001 +data8 0x3fe46aed21f117fc, 0x3c8a2be41f8e9f3d // log(1/frcpa(1+228/256))= +6.38053e-001 +data8 0x3fe47a1527e8a2d0, 0x3cba4a83594fab09 // log(1/frcpa(1+229/256))= +6.39903e-001 +data8 0x3fe489445efffcc8, 0x3cbf306a23dcbcde // log(1/frcpa(1+230/256))= +6.41756e-001 +data8 0x3fe4a018bcb69834, 0x3ca46c9285029fd1 // log(1/frcpa(1+231/256))= +6.44543e-001 +data8 0x3fe4af5a0c9d65d4, 0x3cbbc1db897580e3 // log(1/frcpa(1+232/256))= +6.46405e-001 +data8 0x3fe4bea2a5bdbe84, 0x3cb84d880d7ef775 // log(1/frcpa(1+233/256))= +6.48271e-001 +data8 0x3fe4cdf28f10ac44, 0x3cb3ec4b7893ce1f // log(1/frcpa(1+234/256))= +6.50140e-001 +data8 0x3fe4dd49cf994058, 0x3c897224d59d3408 // log(1/frcpa(1+235/256))= +6.52013e-001 +data8 0x3fe4eca86e64a680, 0x3cbccf620f24f0cd // log(1/frcpa(1+236/256))= +6.53889e-001 +data8 0x3fe503c43cd8eb68, 0x3c3f872c65971084 // log(1/frcpa(1+237/256))= +6.56710e-001 +data8 0x3fe513356667fc54, 0x3cb9ca64cc3d52c8 // log(1/frcpa(1+238/256))= +6.58595e-001 +data8 0x3fe522ae0738a3d4, 0x3cbe708164c75968 // log(1/frcpa(1+239/256))= +6.60483e-001 +data8 0x3fe5322e26867854, 0x3cb9988ba4aea615 // log(1/frcpa(1+240/256))= +6.62376e-001 +data8 0x3fe541b5cb979808, 0x3ca1662e3a6b95f5 // log(1/frcpa(1+241/256))= +6.64271e-001 +data8 0x3fe55144fdbcbd60, 0x3cb3acd4ca45c1e0 // log(1/frcpa(1+242/256))= +6.66171e-001 +data8 0x3fe560dbc45153c4, 0x3cb4988947959fed // 
log(1/frcpa(1+243/256))= +6.68074e-001 +data8 0x3fe5707a26bb8c64, 0x3cb3017fe6607ba9 // log(1/frcpa(1+244/256))= +6.69980e-001 +data8 0x3fe587f60ed5b8fc, 0x3cbe7a3266366ed4 // log(1/frcpa(1+245/256))= +6.72847e-001 +data8 0x3fe597a7977c8f30, 0x3ca1e12b9959a90e // log(1/frcpa(1+246/256))= +6.74763e-001 +data8 0x3fe5a760d634bb88, 0x3cb7c365e53d9602 // log(1/frcpa(1+247/256))= +6.76682e-001 +data8 0x3fe5b721d295f10c, 0x3cb716c2551ccbf0 // log(1/frcpa(1+248/256))= +6.78605e-001 +data8 0x3fe5c6ea94431ef8, 0x3ca02b2ed0e28261 // log(1/frcpa(1+249/256))= +6.80532e-001 +data8 0x3fe5d6bb22ea86f4, 0x3caf43a8bbb2f974 // log(1/frcpa(1+250/256))= +6.82462e-001 +data8 0x3fe5e6938645d38c, 0x3cbcedc98821b333 // log(1/frcpa(1+251/256))= +6.84397e-001 +data8 0x3fe5f673c61a2ed0, 0x3caa385eef5f2789 // log(1/frcpa(1+252/256))= +6.86335e-001 +data8 0x3fe6065bea385924, 0x3cb11624f165c5b4 // log(1/frcpa(1+253/256))= +6.88276e-001 +data8 0x3fe6164bfa7cc068, 0x3cbad884f87073fa // log(1/frcpa(1+254/256))= +6.90222e-001 +data8 0x3fe62643fecf9740, 0x3cb78c51da12f4df // log(1/frcpa(1+255/256))= +6.92171e-001 +ASM_SIZE_DIRECTIVE(pow_Tt) + + +// Table 1 is 2^(index_1/128) where +// index_1 goes from 0 to 15 +pow_tbl1: +ASM_TYPE_DIRECTIVE(pow_tbl1,@object) +data8 0x8000000000000000 , 0x00003FFF +data8 0x80B1ED4FD999AB6C , 0x00003FFF +data8 0x8164D1F3BC030773 , 0x00003FFF +data8 0x8218AF4373FC25EC , 0x00003FFF +data8 0x82CD8698AC2BA1D7 , 0x00003FFF +data8 0x8383594EEFB6EE37 , 0x00003FFF +data8 0x843A28C3ACDE4046 , 0x00003FFF +data8 0x84F1F656379C1A29 , 0x00003FFF +data8 0x85AAC367CC487B15 , 0x00003FFF +data8 0x8664915B923FBA04 , 0x00003FFF +data8 0x871F61969E8D1010 , 0x00003FFF +data8 0x87DB357FF698D792 , 0x00003FFF +data8 0x88980E8092DA8527 , 0x00003FFF +data8 0x8955EE03618E5FDD , 0x00003FFF +data8 0x8A14D575496EFD9A , 0x00003FFF +data8 0x8AD4C6452C728924 , 0x00003FFF +ASM_SIZE_DIRECTIVE(pow_tbl1) + + +// Table 2 is 2^(index_2/8) where +// index_2 goes from 0 to 7 +pow_tbl2:
+ASM_TYPE_DIRECTIVE(pow_tbl2,@object) +data8 0x8000000000000000 , 0x00003FFF +data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF +data8 0x9837F0518DB8A96F , 0x00003FFF +data8 0xA5FED6A9B15138EA , 0x00003FFF +data8 0xB504F333F9DE6484 , 0x00003FFF +data8 0xC5672A115506DADD , 0x00003FFF +data8 0xD744FCCAD69D6AF4 , 0x00003FFF +data8 0xEAC0C6E7DD24392F , 0x00003FFF +ASM_SIZE_DIRECTIVE(pow_tbl2) + +.global pow + +.section .text +.proc pow +.align 32 + +pow: + +{ .mfi + alloc r32=ar.pfs,1,35,4,0 + fms.s1 POW_Xm1 = f8,f1,f1 // Will be used for r1 if x>0 + mov pow_GR_17ones = 0x1FFFF +} +{ .mfi +(p0) addl pow_AD_P = @ltoff(pow_table_P), gp + fma.s1 POW_Xp1 = f8,f1,f1 // Will be used for r1 if x<0 + nop.i 999 +;; +} + + +// Get exponent of x. Will be used to calculate K. +{ .mfi + getf.exp pow_GR_signexp_X = f8 + frcpa.s1 POW_B, p6 = f1,f8 + nop.i 999 +} +{ .mfi + ld8 pow_AD_P = [pow_AD_P] + fma.s1 POW_NORM_X = f8,f1,f0 + mov pow_GR_FFF7 = 0xFFF7 +} +;; + + + +// Get significand of x. Will be used to get index to fetch T, Tt. 
+// p13 = TRUE ==> X is unorm +// DOUBLE 0x10033 exponent limit at which y is an integer +// SINGLE 0x10016 +{ .mfi + getf.sig pow_GR_sig_X = f8 + fclass.m p13,p0 = f8, 0x0b // Test for x unorm + addl pow_GR_10033 = 0x10033, r0 +} +{ .mfi + mov pow_GR_16ones = 0xFFFF + fma.s1 POW_NORM_Y = f9,f1,f0 + nop.i 999 +} +;; + + +// p14 = TRUE ==> X is ZERO +{ .mfi + adds pow_AD_Tt = pow_Tt - pow_table_P, pow_AD_P + fclass.m p14,p15 = f8, 0x07 + and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones +} +{ .mfi + adds pow_AD_Q = pow_table_Q - pow_table_P, pow_AD_P + nop.f 999 + nop.i 999 +} +;; + +{ .mfi + ldfe POW_P5 = [pow_AD_P], 16 + fcmp.lt.s1 p8,p9 = f8, f0 // Test for x<0 + shl pow_GR_offset = pow_GR_sig_X, 1 +} +{ .mib + ldfe POW_P4 = [pow_AD_Q], 16 + sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones +(p13) br.cond.spnt L(POW_X_DENORM) +} +;; + + +// Continue normal and denormal paths here +L(POW_COMMON): +// p11 = TRUE ==> Y is a NAN +{ .mfi + ldfe POW_P3 = [pow_AD_P], 16 + fclass.m.unc p11,p0 = f9, 0xc3 + shr.u pow_GR_offset = pow_GR_offset,56 +} +{ .mfi + ldfe POW_P2 = [pow_AD_Q], 16 + nop.f 999 + nop.i 999 +} +;; + + + +// Compute xsq to decide later if |x|=1 +// p11 = TRUE ==> Y is a NaN +{ .mfi + setf.sig POW_int_K = pow_GR_true_exp_X +(p15) fms.s1 POW_r = POW_B, POW_NORM_X,f1 + shladd pow_AD_Tt = pow_GR_offset, 4, pow_AD_Tt +} +{ .mfi + nop.m 999 +(p8) fnma.s1 POW_Xm1 = POW_Xp1,f1,f0 + nop.i 999 +} +;; + + + +// p12 = TRUE ==> X is ZERO and Y is ZERO +{ .mfi + ldfe POW_P1 = [pow_AD_P], 16 +(p14) fclass.m.unc p12,p0 = f9, 0x07 + nop.i 999 +} +{ .mfb + ldfe POW_P0 = [pow_AD_Q], 16 + fma.s1 POW_xsq = POW_NORM_X, POW_NORM_X, f0 +(p11) br.cond.spnt L(POW_Y_NAN) +} +;; + + +.pred.rel "mutex",p8,p9 +// Get exponent of |x|-1 to use in comparison to 2^-8 +{ .mmf +(p8) getf.exp pow_GR_signexp_Xm1 = POW_Xp1 +(p9) getf.exp pow_GR_signexp_Xm1 = POW_Xm1 + fcvt.fx.s1 POW_int_Y = POW_NORM_Y +} +;; + + +// p11 = TRUE ==> X is a NAN +{ .mfi + ldfpd POW_log2_hi, POW_log2_lo = 
[pow_AD_Q], 16 + fclass.m.unc p11,p0 = f8, 0xc3 + nop.i 999 +} +{ .mib + ldfpd POW_T, POW_Tt = [pow_AD_Tt], 16 + nop.i 999 +(p12) br.cond.spnt L(POW_X_0_Y_0) +} +;; + + +// p14 = TRUE ==> X is zero +// p15 = TRUE ==> X is zero AND Y is negative +// p10 = TRUE ==> X is zero AND Y is >= zero +{ .mfi + ldfe POW_inv_log2_by_128 = [pow_AD_P], 16 +(p14) fcmp.lt.unc.s1 p15, p10 = f9,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + nop.f 999 + and pow_GR_exp_Xm1 = pow_GR_signexp_Xm1, pow_GR_17ones +} +;; + + +// Determine if we will use the |x| near 1 path (p6) or normal path (p7) +// p12 = TRUE ==> X is a NAN and Y is a zero +// p13 = TRUE ==> X is a NAN and Y is anything else +{ .mfi + getf.exp pow_GR_signexp_Y = POW_NORM_Y +(p11) fclass.m.unc p12,p13 = f9, 0x07 + cmp.lt.unc p6,p7 = pow_GR_exp_Xm1, pow_GR_FFF7 +} +{ .mfi + ldfpd POW_Q2, POW_Q3 = [pow_AD_P], 16 + fma.s1 POW_rsq = POW_r, POW_r,f0 + nop.i 999 +;; +} + +// If on the x near 1 path, assign r1 to r and r1*r1 to rsq +{ .mfi + ldfpd POW_Q0_half, POW_Q1 = [pow_AD_P], 16 +(p6) fma.s1 POW_r = POW_r1, f1, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p6) fma.s1 POW_rsq = POW_r1, POW_r1, f0 + nop.i 999 +;; +} + + +{ .mfi + ldfpd POW_Q4, POW_RSHF = [pow_AD_P], 16 +(p7) fma.s1 POW_v6 = POW_r, POW_P5, POW_P4 + and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones +} +{ .mfb + nop.m 999 +(p6) fma.s1 POW_v6 = POW_r1, POW_P5, POW_P4 +(p12) br.cond.spnt L(POW_X_NAN_Y_0) +} +;; + + +{ .mfi + nop.m 999 +(p7) fma.s1 POW_v4 = POW_P3, POW_r, POW_P2 + andcm pow_GR_sign_Y = pow_GR_signexp_Y, pow_GR_17ones +} +{ .mfb + nop.m 999 +(p6) fma.s1 POW_v4 = POW_P3, POW_r1, POW_P2 +(p12) br.cond.spnt L(POW_X_NAN_Y_0) +} +;; + +{ .mfi + nop.m 999 + fcvt.xf POW_K = POW_int_K + nop.i 999 +} +{ .mfb + nop.m 999 +(p13) fma.d f8 = f8,f1,f0 +(p13) br.ret.spnt b0 // Exit if x nan, y anything but zero +} +;; + +// p10 = TRUE ==> X is zero AND Y is positive +// p8 = TRUE ==> X is zero AND Y is outside integer range (treat as even int) +// return +0 +// p9 = TRUE ==> X 
is zero AND Y is within integer range (may not be integer) +{ .mfi +(p10) cmp.gt.unc p8,p9 = pow_GR_exp_Y, pow_GR_10033 +(p6) fmerge.s POW_delta = f0,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p6) fma.s1 POW_G = f0,f0,f0 + nop.i 999 +} +;; + +{ .mfi + getf.sig pow_GR_sig_int_Y = POW_int_Y + fnma.s1 POW_twoV = POW_NORM_Y, POW_rsq,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_U = POW_NORM_Y,POW_r,f0 + nop.i 999 +} +;; + +{ .mfi + ldfe POW_log2_by_128_lo = [pow_AD_P], 16 +(p6) fma.s1 POW_v2 = POW_P1, POW_r1, POW_P0 + nop.i 999 +} +{ .mfi + ldfe POW_log2_by_128_hi = [pow_AD_Q], 16 +(p7) fma.s1 POW_v2 = POW_P1, POW_r, POW_P0 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fcvt.xf POW_float_int_Y = POW_int_Y + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_v3 = POW_v6, POW_rsq, POW_v4 + adds pow_AD_tbl1 = pow_tbl1 - pow_Tt, pow_AD_Q +} +;; + +{ .mfi + nop.m 999 +(p7) fma.s1 POW_delta = POW_K, POW_log2_lo, POW_Tt + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 POW_G = POW_K, POW_log2_hi, POW_T + adds pow_AD_tbl2 = pow_tbl2 - pow_tbl1, pow_AD_tbl1 +} +;; + + +{ .mfi + nop.m 999 + fms.s1 POW_e2 = POW_NORM_Y, POW_r, POW_U + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_Z2 = POW_twoV, POW_Q0_half, POW_U + nop.i 999 +} +;; + +// p11 = TRUE ==> X is NEGATIVE +// p8 = TRUE ==> X is zero AND Y is outside integer range (treat as even int) +// return +0 +{ .mfi + nop.m 999 + fclass.m.unc p11,p0 = f8, 0x1a + nop.i 999 +} +{ .mfb + nop.m 999 +(p8) fma.d f8 = f0,f0,f0 +(p8) br.ret.spnt b0 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_Yrcub = POW_rsq, POW_U, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_p = POW_rsq, POW_v3, POW_v2 + nop.i 999 +} +;; + + +// p11 = TRUE ==> X is NEGATIVE +// p12 = TRUE ==> X is NEGATIVE AND Y already int +// p13 = TRUE ==> X is NEGATIVE AND Y possible int +{ .mfi + nop.m 999 + fma.s1 POW_Z1 = POW_NORM_Y, POW_G, f0 +(p11) cmp.ge.unc p12,p13 = pow_GR_exp_Y, pow_GR_10033 +} +{ .mfi + nop.m 999 + fma.s1 POW_e3 = POW_NORM_Y, POW_delta, f0 + nop.i 999 +} +;; +
+// p9 = TRUE ==> X is zero AND Y is within integer range (may not be integer) +// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd) +// p7 = TRUE ==> X is zero AND Y is NOT an integer, return +0 +{ .mfi + nop.m 999 +(p9) fcmp.eq.unc.s1 p6,p7 = POW_float_int_Y, POW_NORM_Y + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_Gpr = POW_G, f1, POW_r + nop.i 999 +} +;; + +// By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand +{ .mfi + nop.m 999 + fma.s1 POW_W2 = POW_Z2, POW_inv_log2_by_128, POW_RSHF + nop.i 999 +} +{ .mfi + nop.m 999 + fms.s1 POW_UmZ2 = POW_U, f1, POW_Z2 + nop.i 999 +} +;; + + +// If x=0 and y>0, test y and flag denormal +// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd) +// p8 = TRUE ==> X is zero AND Y is an odd integer +// p9 = TRUE ==> X is zero AND Y is an even integer +{ .mfi + nop.m 999 +(p10) fcmp.eq.s0 p15,p0 = f9,f0 +(p6) tbit.nz.unc p8,p9 = pow_GR_sig_int_Y,0 +} +{ .mfi + nop.m 999 + fma.s1 POW_Z3 = POW_p, POW_Yrcub, f0 + nop.i 999 +} +;; + +// By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand +{ .mfi + nop.m 999 + fms.s1 POW_e1 = POW_NORM_Y, POW_G, POW_Z1 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_W1 = POW_Z1, POW_inv_log2_by_128, POW_RSHF + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p7) fma.d f8 = f0,f0,f0 // Result +0 if x zero and y not integer + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 POW_Y_Gpr = POW_NORM_Y, POW_Gpr, f0 +(p8) br.ret.spnt b0 // Exit if x zero and y odd integer +} +;; + +// By subtracting RSHF we get rounded integer POW_N2float +// p15 = TRUE ==> X_0_Y_NEG +{ .mfi + nop.m 999 + fms.s1 POW_N2float = POW_W2, f1, POW_RSHF + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 POW_UmZ2pV = POW_twoV,POW_Q0_half,POW_UmZ2 +(p15) br.cond.spnt L(POW_X_0_Y_NEG) +} +;; + + + +{ .mfi + nop.m 999 + fma.s1 POW_Z3sq = POW_Z3, POW_Z3, f0 + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 POW_v4 = POW_Z3, POW_Q3, POW_Q2 +(p7) br.ret.spnt b0 // Exit if x zero and y not an 
integer +} +;; + + + +// Extract rounded integer from rightmost significand of POW_W2 +// By subtracting RSHF we get rounded integer POW_N1float +{ .mfi + getf.sig pow_GR_int_W2 = POW_W2 + fms.s1 POW_N1float = POW_W1, f1, POW_RSHF + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_v2 = POW_Z3, POW_Q1, POW_Q0_half + nop.i 999 +} +;; + + + + +// p13 = TRUE ==> X is NEGATIVE AND Y possible int +// p10 = TRUE ==> X is NEG and Y is an int +// p12 = TRUE ==> X is NEG and Y is not an int +{ .mfi + nop.m 999 +(p13) fcmp.eq.unc.s1 p10,p12 = POW_float_int_Y, POW_NORM_Y + nop.i 999 +} +{ .mfb + nop.m 999 +(p9) fma.d f8 = f0,f0,f0 // Result +0 if x zero and y even integer +(p9) br.ret.spnt b0 // Exit if x zero and y even integer +} +;; + + +{ .mfi + nop.m 999 + fnma.s1 POW_s2 = POW_N2float, POW_log2_by_128_hi, POW_Z2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_e2 = POW_e2,f1,POW_UmZ2pV + nop.i 999 +} +;; + +// Extract rounded integer from rightmost significand of POW_W1 +// Test if x inf +{ .mfi + getf.sig pow_GR_int_W1 = POW_W1 + fclass.m.unc p15,p0 = POW_NORM_X, 0x23 + nop.i 999 +} +{ .mfb + nop.m 999 + fnma.s1 POW_f2 = POW_N2float, POW_log2_by_128_lo, f1 +(p12) br.cond.spnt L(POW_X_NEG_Y_NONINT) // Branch if x neg, y not integer +} +;; + +// p12 = TRUE ==> X is NEGATIVE AND Y is an odd integer +{ .mfi + getf.exp pow_GR_signexp_Y_Gpr = POW_Y_Gpr + fma.s1 POW_v3 = POW_Z3sq, POW_Q4, POW_v4 +(p10) tbit.nz.unc p12,p0 = pow_GR_sig_int_Y,0 +} +;; + + +{ .mfi + add pow_GR_int_N = pow_GR_int_W1, pow_GR_int_W2 + fnma.s1 POW_f1 = POW_N1float, POW_log2_by_128_lo, f1 + nop.i 999 +} +{ .mfb + nop.m 999 + fnma.s1 POW_s1 = POW_N1float, POW_log2_by_128_hi, POW_Z1 +(p15) br.cond.spnt L(POW_X_INF) +} +;; + + +// Test x and y and flag denormal +{ .mfi + and pow_GR_index1 = 0x0f, pow_GR_int_N + fcmp.eq.s0 p15,p0 = f8,f9 + shr r2 = pow_GR_int_N, 7 +} +{ .mfi + and pow_GR_exp_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones + nop.f 999 + and pow_GR_index2 = 0x70, pow_GR_int_N +} +;; + + + +{ .mfi + 
shladd pow_AD_T1 = pow_GR_index1, 4, pow_AD_tbl1 + fcmp.eq.s1 p7,p0 = POW_NORM_Y, f1 // Test for y=1.0 + sub pow_GR_true_exp_Y_Gpr = pow_GR_exp_Y_Gpr, pow_GR_16ones +} +{ .mfi + addl pow_int_GR_M = 0xFFFF, r2 + fma.s1 POW_e12 = POW_e1,f1,POW_e2 + add pow_AD_T2 = pow_AD_tbl2, pow_GR_index2 +} +;; + + +{ .mmi + ldfe POW_T1 = [pow_AD_T1],16 + setf.exp POW_2M = pow_int_GR_M + andcm pow_GR_sign_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones +} +;; + + +{ .mfb + ldfe POW_T2 = [pow_AD_T2],16 + fma.s1 POW_q = POW_Z3sq, POW_v3, POW_v2 +(p7) br.ret.spnt b0 // Early exit if y=1.0, result is x +} +;; + + +// double: p8 TRUE ==> |Y(G + r)| >= 10 +// single: p8 TRUE ==> |Y(G + r)| >= 7 + +// double +// -2^10 -2^9 2^9 2^10 +// -----+-----+----+ ... +-----+-----+----- +// p8 | p9 | p8 +// | | p10 | | +// single +// -2^7 -2^6 2^6 2^7 +// -----+-----+----+ ... +-----+-----+----- +// p8 | p9 | p8 +// | | p10 | | + + +{ .mfi +(p0) cmp.le.unc p8,p9 = 10, pow_GR_true_exp_Y_Gpr + fma.s1 POW_s = POW_s1, f1, POW_s2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_f12 = POW_f1, POW_f2,f0 + nop.i 999 +} +;; + + +{ .mfi + nop.f 999 +(p9) cmp.le.unc p0,p10 = 9, pow_GR_true_exp_Y_Gpr +} +;; + + + +{ .mfb + nop.m 999 + fma.s1 POW_e123 = POW_e12, f1, POW_e3 +(p8) br.cond.spnt L(POW_OVER_UNDER_X_NOT_INF) +} +;; + + +{ .mmf + fma.s1 POW_q = POW_Z3sq, POW_q, POW_Z3 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 POW_ssq = POW_s, POW_s, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_v4 = POW_s, POW_Q3, POW_Q2 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_v2 = POW_s, POW_Q1, POW_Q0_half + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_1ps = f1,f1,POW_s + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_f3 = POW_e123,f1,f1 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_T1T2 = POW_T1, POW_T2, f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_v3 = POW_ssq, POW_Q4, POW_v4 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_v21ps = POW_ssq, POW_v2, POW_1ps + nop.i 999 +} +{ .mfi + nop.m 
999 + fma.s1 POW_s4 = POW_ssq, POW_ssq, f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_f123 = POW_f12, POW_f3, f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_A = POW_2M, POW_T1T2, f0 + nop.i 999 +} +;; + + + +{ .mfi + nop.m 999 +(p12) fmerge.s POW_f123 = f8,POW_f123 // if x neg, y odd int + nop.i 999 +} +{ .mfi + nop.m 999 +// fma.s1 POW_es = POW_ssq, POW_v3, POW_v2 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_es = POW_s4, POW_v3, POW_v21ps + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 POW_A = POW_A, POW_f123, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +// fma.s1 POW_es = POW_es, POW_ssq, POW_1ps + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 POW_A = POW_A, POW_es,f0 + nop.i 999 +} +;; + + + +{ .mfb + nop.m 999 +(p10) fma.d f8 = POW_A, POW_q, POW_A +(p10) br.ret.sptk b0 +} +;; + + + + + +// POSSIBLE_OVER_UNDER +// p6 = TRUE ==> Y negative + +{ .mfi + nop.m 999 + fmerge.s POW_abs_A = f0, POW_A + cmp.eq.unc p0,p6 = pow_GR_sign_Y, r0 +} +;; + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(POW_POSSIBLE_UNDER) +} +;; + +// POSSIBLE_OVER +// We got an answer. +// overflow is a possibility, not a certainty + + +// We define an overflow when the answer with +// WRE set +// user-defined rounding mode + +// double +// Largest double is 7FE (biased double) +// 7FE - 3FF + FFFF = 103FE +// Create + largest_double_plus_ulp +// Create - largest_double_plus_ulp +// Calculate answer with WRE set. + +// single +// Largest single is FE (biased double) +// FE - 7F + FFFF = 1007E +// Create + largest_single_plus_ulp +// Create - largest_single_plus_ulp +// Calculate answer with WRE set. 
+ +// Cases when answer is ldn+1 are as follows: +// ldn ldn+1 +// --+----------|----------+------------ +// | +// +inf +inf -inf +// RN RN +// RZ + + +// Put in s2 (td set, wre set) +{ .mfi + mov pow_GR_gt_ln = 0x103ff + fsetc.s2 0x7F,0x42 + nop.i 999 +} +;; + + +{ .mfi + setf.exp POW_gt_pln = pow_GR_gt_ln + fma.d.s2 POW_wre_urm_f8 = POW_abs_A, POW_q, POW_abs_A + nop.i 999 ;; +} + +// Return s2 to default +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x40 + nop.i 999 +} +;; + + +// p7 = TRUE ==> yes, we have an overflow +{ .mfi + nop.m 999 + fcmp.ge.unc.s1 p7, p0 = POW_wre_urm_f8, POW_gt_pln + nop.i 999 +} +;; + + + +{ .mfb +(p7) mov pow_GR_tag = 24 + fma.d f8 = POW_A, POW_q, POW_A +(p7) br.cond.spnt __libm_error_region +} +{ .mfb + nop.m 999 + nop.f 999 +(p0) br.ret.sptk b0 +} +;; + + +L(POW_POSSIBLE_UNDER): +// We got an answer. input was < -2^9 but > -2^10 (double) +// We got an answer. input was < -2^6 but > -2^7 (float) +// underflow is a possibility, not a certainty + +// We define an underflow when the answer with +// ftz set +// is zero (tiny numbers become zero) +// Notice (from below) that if we have an unlimited exponent range, +// then there is an extra machine number E between the largest denormal and +// the smallest normal. +// So if with unbounded exponent we round to E or below, then we are +// tiny and underflow has occurred. +// But notice that you can be in a situation where we are tiny, namely +// rounded to E, but when the exponent is bounded we round to smallest +// normal. So the answer can be the smallest normal with underflow. 
+// E +// -----+--------------------+--------------------+----- +// | | | +// 1.1...10 2^-3fff 1.1...11 2^-3fff 1.0...00 2^-3ffe +// 0.1...11 2^-3ffe (biased, 1) +// largest dn smallest normal + + +// Put in s2 (td set, ftz set) +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x41 + nop.i 999 +} +;; + + + +{ .mfi + nop.m 999 + fma.d.s2 POW_ftz_urm_f8 = POW_A, POW_q, POW_A + nop.i 999 +} +;; + + +// Return s2 to default +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x40 + nop.i 999 +} +;; + + +// p7 = TRUE ==> yes, we have an underflow +{ .mfi + nop.m 999 + fcmp.eq.unc.s1 p7, p0 = POW_ftz_urm_f8, f0 + nop.i 999 +} +;; + + + + +{ .mfb +(p7) mov pow_GR_tag = 25 + fma.d f8 = POW_A, POW_q, POW_A +(p7) br.cond.spnt __libm_error_region +} +;; + + +{ .mfb + nop.m 999 + nop.f 999 + br.ret.sptk b0 +} +;; + + +L(POW_X_DENORM): +// Here if x unorm. Use the NORM_X for getf instructions, and the back +// to normal path +{ .mfi + getf.exp pow_GR_signexp_X = POW_NORM_X + nop.f 999 + nop.i 999 +} +;; + +{ .mfi + getf.sig pow_GR_sig_X = POW_NORM_X + nop.f 999 + nop.i 999 +} +;; + +{ .mfi + and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones + nop.f 999 +} +;; + +{ .mib + sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones + shl pow_GR_offset = pow_GR_sig_X, 1 + br.cond.sptk L(POW_COMMON) +} +;; + + +L(POW_X_0_Y_0): +// When X is +-0 and Y is +-0, IEEE returns 1.0 +// We call error support with this value + +{ .mfb + mov pow_GR_tag = 26 + fma.d f8 = f1,f1,f0 + br.cond.sptk __libm_error_region +} +;; + + + + +L(POW_X_INF): +// When X is +-inf and Y is +-, IEEE returns + +// overflow +// X +inf Y +inf +inf +// X -inf Y +inf +inf + +// X +inf Y >0 +inf +// X -inf Y >0, !odd integer +inf <== (-inf)^0.5 = +inf !! 
+// X -inf Y >0, odd integer -inf + +// underflow +// X +inf Y -inf +0 +// X -inf Y -inf +0 + +// X +inf Y <0 +0 +// X -inf Y <0, !odd integer +0 +// X -inf Y <0, odd integer -0 + +// X + inf Y=+0 +1 +// X + inf Y=-0 +1 +// X - inf Y=+0 +1 +// X - inf Y=-0 +1 + +// p13 == Y negative +// p14 == Y positive + +// p6 == Y is a floating point number outside the integer. +// Hence it is an integer and is even. +// p13 == (Y negative) +// return +inf +// p14 == (Y positive) +// return +0 + + + +// p7 == Y is a floating point number within the integer range. +// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even. +// p11 odd +// p13 == (Y negative) +// return (sign_of_x)inf +// p14 == (Y positive) +// return (sign_of_x)0 +// pxx even +// p13 == (Y negative) +// return +inf +// p14 == (Y positive) +// return +0 + +// pxx == Y is not an integer +// p13 == (Y negative) +// return +inf +// p14 == (Y positive) +// return +0 +// + +// If x=inf, test y and flag denormal +{ .mfi + nop.m 999 + fcmp.eq.s0 p10,p11 = f9,f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fcmp.lt p13,p14 = POW_NORM_Y,f0 + cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033 +} +{ .mfi + nop.m 999 + fclass.m p12,p0 = f9, 0x23 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fclass.m p15,p0 = f9, 0x07 //@zero + nop.i 999 +} +;; + +{ .mfb + nop.m 999 +(p15) fmerge.s f8 = f1,f1 +(p15) br.ret.spnt b0 +} +;; + + +{ .mfi +(p13) mov pow_GR_tag = 25 +(p14) frcpa.s1 f8,p10 = f1,f0 + nop.i 999 +} +{ .mfb +(p14) mov pow_GR_tag = 24 +(p13) fma.s1 f8 = f0,f0,f0 +(p12) br.ret.spnt b0 +} +;; + + + +{ .mfb + nop.m 999 +(p7) fcmp.eq.unc.s1 p9,p0 = POW_float_int_Y, POW_NORM_Y + nop.b 999 +} +;; + +{ .mfi + nop.m 999 + nop.f 999 +(p9) tbit.nz.unc p11,p0 = pow_GR_sig_int_Y,0 +} +;; + +{ .mfb + nop.m 999 +(p11) fmerge.s f8 = POW_NORM_X,f8 + br.ret.sptk b0 +} +;; + + + +L(POW_X_0_Y_NEG): +// When X is +-0 and Y is negative, IEEE returns +// X Y answer +// +0 -odd int +inf +// -0 -odd int -inf + +// +0 !-odd int +inf +// -0 
!-odd int +inf + + +// p6 == Y is a floating point number outside the integer. +// Hence it is an integer and is even. +// return +inf + +// p7 == Y is a floating point number within the integer range. +// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even. +// p11 odd +// return (sign_of_x)inf +// p12 even +// return +inf +// p10 == Y is not an integer +// return +inf +// +// + +{ .mfi + nop.m 999 + nop.f 999 + cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033 +} +;; + + +{ .mfi + mov pow_GR_tag = 27 +(p7) fcmp.eq.unc.s1 p9,p10 = POW_float_int_Y, POW_NORM_Y + nop.i 999 +} +;; + + +{ .mfb + nop.m 999 +(p6) frcpa.s0 f8,p13 = f1, f0 +(p6) br.cond.sptk __libm_error_region +} +;; + +{ .mfb + nop.m 999 +(p10) frcpa.s0 f8,p13 = f1, f0 +(p10) br.cond.sptk __libm_error_region +} +;; + + + +{ .mib + nop.m 999 +(p9) tbit.nz.unc p11,p12 = pow_GR_sig_int_Y,0 + nop.b 999 +} +;; + + + +{ .mfi + nop.m 999 +(p12) frcpa.s0 f8,p13 = f1,f0 + nop.i 999 +} +;; + +{ .mfb + nop.m 999 +(p11) frcpa f8,p13 = f1,f8 + br.cond.sptk __libm_error_region +} +;; + + + + +L(POW_X_NEG_Y_NONINT): +// When X is negative and Y is a non-integer, IEEE +// returns a qnan indefinite. +// We call error support with this value + +{ .mfb + mov pow_GR_tag = 28 + frcpa f8,p6 = f0,f0 + br.cond.sptk __libm_error_region +} +;; + + + + +L(POW_X_NAN_Y_0): +// When X is a NAN and Y is zero, IEEE returns 1. +// We call error support with this value. 
+ +{ .mfi + nop.m 0 + fma.d.s0 f10 = f8,f1,f0 + nop.i 0 +} +{ .mfb + mov pow_GR_tag = 29 + fma.d.s0 f8 = f0,f0,f1 + br.cond.sptk __libm_error_region +} +;; + + +L(POW_OVER_UNDER_X_NOT_INF): + +// p8 is TRUE for overflow +// p9 is TRUE for underflow + +// if y is infinity, we should not over/underflow + + +{ .mfi + nop.m 999 + fcmp.eq.unc.s1 p14, p13 = POW_xsq,f1 + cmp.eq.unc p8,p9 = pow_GR_sign_Y_Gpr, r0 +} +;; + +{ .mfi + nop.m 999 +(p14) fclass.m.unc p15, p0 = f9, 0x23 + nop.i 999 +} +{ .mfi + nop.m 999 +(p13) fclass.m.unc p11,p0 = f9, 0x23 + nop.i 999 +} +;; + +// p15 = TRUE if |x|=1, y=inf, return +1 +{ .mfb + nop.m 999 +(p15) fma.d f8 = f1,f1,f0 +(p15) br.ret.spnt b0 +} +;; + +.pred.rel "mutex",p8,p9 +{ .mfb +(p8) setf.exp f8 = pow_GR_17ones +(p9) fmerge.s f8 = f0,f0 +(p11) br.ret.sptk b0 +} + +{ .mfb + nop.m 999 + nop.f 999 + br.cond.sptk L(POW_OVER_UNDER_ERROR) +} +;; + +L(POW_Y_NAN): + +// Is x = +1 then result is +1, else result is quiet Y +{ .mfi + nop.m 999 + fcmp.eq.s1 p10,p9 = POW_NORM_X, f1 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p10) fcmp.eq.s0 p6,p0 = f9,f1 // Set invalid, even if x=+1 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p10) fma.d f8 = f1,f1,f0 + nop.i 999 +} +{ .mfb + nop.m 999 +(p9) fma.d f8 = f9,f8,f0 + br.ret.sptk b0 +} +;; + + +L(POW_OVER_UNDER_ERROR): + +{ .mfi + nop.m 999 + fmerge.s f10 = POW_NORM_X,POW_NORM_X + nop.i 999 +} +{ .mfi + sub pow_GR_17ones_m1 = pow_GR_17ones, r0, 1 + nop.f 999 + mov pow_GR_one = 0x1 +} +;; + +// overflow +{ .mmb +(p8) mov pow_GR_tag = 24 +(p8) setf.exp f11 = pow_GR_17ones_m1 + nop.b 999 +} +;; + + +// underflow +{ .mmi +(p9) mov pow_GR_tag = 25 +(p9) setf.exp f11 = pow_GR_one + nop.i 999 +} +;; + + +// p12 x is negative and y is an odd integer + + +{ .mfi + nop.m 999 + fma.d f8 = f11, f11, f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p12) fmerge.ns f8 = f8, f8 + nop.i 999 +} +;; + + +.endp pow +ASM_SIZE_DIRECTIVE(pow) + + +// Stack operations when calling error support. 
+// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 +// | | | | +// | <-GR_Y Y2->| Y2 ->| <- GR_Y | +// | | | | +// | | <- GR_X X1 ->| | +// | | | | +// sp-64 -> + sp -> + sp -> + + +// save ar.pfs save b0 restore gp +// save gp restore ar.pfs + + + +.proc __libm_error_region +__libm_error_region: + +// Common error exit: on entry f8 holds the default result (e.g. inf for overflow, 0 for underflow) and pow_GR_tag the error tag chosen by the branching path. +.prologue +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address (entry sp - 32) + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new 64-byte stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfd [GR_Parameter_Y] = POW_NORM_Y,16 // STORE Parameter 2 on stack, then advance pointer by 16 + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfd [GR_Parameter_X] = POW_NORM_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp // Recompute Parameter 3 address (sp + 48) after the call +};; + +// (4) +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_powf.S b/sysdeps/ia64/fpu/e_powf.S new file mode 100644 index 0000000000..1c0ebd8114 --- /dev/null +++ b/sysdeps/ia64/fpu/e_powf.S @@ -0,0 +1,2309 @@ +.file "powf.s" + +// Copyright (c) 2000, 2001, Intel 
Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 2/03/00 Added p12 to definite over/under path. With odd power we did not +// maintain the sign of x in this path. +// 4/04/00 Unwind support added +// 4/19/00 pow(+-1,inf) now returns NaN +// pow(+-val, +-inf) returns 0 or inf, but now does not call error support +// Added s1 to fcvt.fx because invalid flag was incorrectly set. +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. 
+// 9/07/00 Improved performance by eliminating bank conflicts and other stalls, +// and tweaking the critical path +// 9/08/00 Per c99, pow(+-1,inf) now returns 1, and pow(+1,nan) returns 1 +// 9/28/00 Updated NaN**0 path +// 1/20/01 Fixed denormal flag settings. +// 2/12/01 Improved speed. +// +// API +//============================================================== +// double pow(double, double) +// float powf(float, float) +// +// Overview of operation +//============================================================== +// +// Three steps... +// 1. Log(x) +// 2. y Log(x) +// 3. exp(y log(x)) +// +// This means we work with the absolute value of x and merge in the sign later. +// Log(x) = G + delta + r -rsq/2 + p +// G,delta depend on the exponent of x and table entries. The table entries are +// indexed by the exponent of x, called K. +// +// The G and delta come out of the reduction; r is the reduced x. +// +// B = frcpa(x) +// xB-1 is small means that B is the approximate inverse of x. +// +// Log(x) = Log( (1/B)(Bx) ) +// = Log(1/B) + Log(Bx) +// = Log(1/B) + Log( 1 + (Bx-1)) +// +// x = 2^K 1.x_1x_2.....x_52 +// B= frcpa(x) = 2^-K Cm +// Log(1/B) = Log(1/(2^-K Cm)) +// Log(1/B) = Log((2^K/ Cm)) +// Log(1/B) = K Log(2) + Log(1/Cm) +// +// Log(x) = K Log(2) + Log(1/Cm) + Log( 1 + (Bx-1)) +// +// If you take the significand of x, set the exponent to true 0, then Cm is +// the frcpa. We tabulate the Log(1/Cm) values. There are 256 of them. +// The frcpa table is indexed by 8 bits, the x_1 thru x_8. +// m = x_1x_2...x_8 is an 8-bit index. +// +// Log(1/Cm) = log(1/frcpa(1+m/256)) where m goes from 0 to 255. +// +// We tabulate as two doubles, T and t, where T +t is the value itself. +// +// Log(x) = (K Log(2)_hi + T) + (K Log(2)_lo + t) + Log( 1 + (Bx-1)) +// Log(x) = G + delta + Log( 1 + (Bx-1)) +// +// The Log( 1 + (Bx-1)) can be calculated as a series in r = Bx-1. 
+// +// Log( 1 + (Bx-1)) = r - rsq/2 + p +// +// Then, +// +// yLog(x) = yG + y delta + y(r-rsq/2) + yp +// yLog(x) = Z1 + e1 + Z2 + Z3 + (e2 + e3) +// +// +// exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3) +// +// +// exp(Z3) is another series. +// exp(e1 + e2 + e3) is approximated as f3 = 1 + (e1 + e2 + e3) +// +// Z1 (128/log2) = number of log2/128 in Z1 is N1 +// Z2 (128/log2) = number of log2/128 in Z2 is N2 +// +// s1 = Z1 - N1 log2/128 +// s2 = Z2 - N2 log2/128 +// +// s = s1 + s2 +// N = N1 + N2 +// +// exp(Z1 + Z2) = exp(Z) +// exp(Z) = exp(s) exp(N log2/128) +// +// exp(r) = exp(Z - N log2/128) +// +// r = s + d = (Z - N (log2/128)_hi) -N (log2/128)_lo +// = Z - N (log2/128) +// +// Z = s+d +N (log2/128) +// +// exp(Z) = exp(s) (1+d) exp(N log2/128) +// +// N = M 128 + n +// +// N log2/128 = M log2 + n log2/128 +// +// n is 7 binary digits = n_7n_6...n_1 +// +// n log2/128 = n_7n_6n_5 16 log2/128 + n_4n_3n_2n_1 log2/128 +// n log2/128 = n_7n_6n_5 log2/8 + n_4n_3n_2n_1 log2/128 +// n log2/128 = I2 log2/8 + I1 log2/128 +// +// N log2/128 = M log2 + I2 log2/8 + I1 log2/128 +// +// exp(Z) = exp(s) (1+d) exp(log(2^M) + log(2^I2/8) + log(2^I1/128)) +// exp(Z) = exp(s) (1+d1) (1+d2)(2^M) 2^I2/8 2^I1/128 +// exp(Z) = exp(s) f1 f2 (2^M) 2^I2/8 2^I1/128 +// +// I1, I2 are table indices. Use a series for exp(s). +// Then get exp(Z) +// +// exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3) +// exp(yLog(x)) = exp(Z) exp(Z3) f3 +// exp(yLog(x)) = exp(Z)f3 exp(Z3) +// exp(yLog(x)) = A exp(Z3) +// +// We actually calculate exp(Z3) -1. +// Then, +// exp(yLog(x)) = A + A( exp(Z3) -1) +// + +// Table Generation +//============================================================== + +// The log values +// ============== +// The operation (K*log2_hi) must be exact. K is the true exponent of x. +// If we allow gradual underflow (denormals), K can be represented in 12 bits +// (as a two's complement number). We assume 13 bits as an engineering precaution. 
+// +// +------------+----------------+-+ +// | 13 bits | 50 bits | | +// +------------+----------------+-+ +// 0 1 66 +// 2 34 +// +// So we want the lsb(log2_hi) to be 2^-50 +// We get log2 as a quad-extended (15-bit exponent, 128-bit significand) +// +// 0 fffe b17217f7d1cf79ab c9e3b39803f2f6af (4...) +// +// Consider numbering the bits left to right, starting at 0 thru 127. +// Bit 0 is the 2^-1 bit; bit 49 is the 2^-50 bit. +// +// ...79ab +// 0111 1001 1010 1011 +// 44 +// 89 +// +// So if we shift off the rightmost 14 bits, then (shift back only +// the top half) we get +// +// 0 fffe b17217f7d1cf4000 e6af278ece600fcb dabc000000000000 +// +// Put the right 64-bit significand in an FR register, convert to double; +// it is exact. Put the next 128 bits into a quad register and round to double. +// The true exponent of the low part is -51. +// +// hi is 0 fffe b17217f7d1cf4000 +// lo is 0 ffcc e6af278ece601000 +// +// Convert to double memory format and get +// +// hi is 0x3fe62e42fefa39e8 +// lo is 0x3cccd5e4f1d9cc02 +// +// log2_hi + log2_lo is an accurate value for log2. +// +// +// The T and t values +// ================== +// A similar method is used to generate the T and t values. +// +// K * log2_hi + T must be exact. +// +// Smallest T,t +// ---------- +// The smallest T,t is +// T t +// data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 log(1/frcpa(1+0/256))= +1.95503e-003 +// +// The exponent is 0x3f6 (biased) or -9 (true). +// For the smallest T value, what we want is to clip the significand such that +// when it is shifted right by 9, its lsb is in the bit for 2^-51. The 9 is specific +// to the first entry. In general, it is 0xffff - (biased 15-bit exponent). + +// Independently, what we have calculated is the table value as a quad precision number. 
+// Table entry 1 is +// 0 fff6 80200aaeac44ef38 338f77605fdf8000 +// +// We store this quad precision number in a data structure that is +// sign: 1 +// exponent: 15 +// significand_hi: 64 (includes explicit bit) +// significand_lo: 49 +// Because the explicit bit is included, the significand is 113 bits. +// +// Consider significand_hi for table entry 1. +// +// +// +-+--- ... -------+--------------------+ +// | | +// +-+--- ... -------+--------------------+ +// 0 1 4444444455555555556666 +// 2345678901234567890123 +// +// Labeled as above, bit 0 is 2^0, bit 1 is 2^-1, etc. +// Bit 42 is 2^-42. If we shift to the right by 9, the bit in +// bit 42 goes in 51. +// +// So what we want to do is shift bits 43 thru 63 into significand_lo. +// This is shifting bit 42 into bit 63, taking care to retain the shifted-off bits. +// Then shifting (just with significand_hi) back into bit 42. +// +// The shift_value is 63-42 = 21. In general, this is +// 63 - (51 -(0xffff - 0xfff6)) +// For this example, it is +// 63 - (51 - 9) = 63 - 42 = 21 +// +// This means we are shifting 21 bits into significand_lo. We must maintain more +// than a 128-bit significand not to lose bits. So before the shift we put the 128-bit +// significand into a 256-bit significand and then shift. +// The 256-bit significand has four parts: hh, hl, lh, and ll. +// +// Start off with +// hh hl lh ll +// <64> <49><15_0> <64_0> <64_0> +// +// After shift by 21 (then return for significand_hi), +// <43><21_0> <21><43> <6><58_0> <64_0> +// +// Take the hh part and convert to a double. There is no rounding here. +// The conversion is exact. The true exponent of the high part is the same as the +// true exponent of the input quad. +// +// We have some 64 plus significand bits for the low part. In this example, we have +// 70 bits. We want to round this to a double. Put them in a quad and then do a quad fnorm. 
+// For this example the true exponent of the low part is +// true_exponent_of_high - 43 = true_exponent_of_high - (64-21) +// In general, this is +// true_exponent_of_high - (64 - shift_value) +// +// +// Largest T,t +// ---------- +// The largest T,t is +// data8 0x3fe62643fecf9742, 0x3c9e3147684bd37d log(1/frcpa(1+255/256))= +6.92171e-001 +// +// Table entry 256 is +// 0 fffe b1321ff67cba178c 51da12f4df5a0000 +// +// The shift value is +// 63 - (51 -(0xffff - 0xfffe)) = 13 +// +// The true exponent of the low part is +// true_exponent_of_high - (64 - shift_value) +// -1 - (64-13) = -52 +// Biased as a double, this is 0x3cb +// +// +// +// So then lsb(T) must be >= 2^-51 +// msb(Klog2_hi) <= 2^12 +// +// +--------+---------+ +// | 51 bits | <== largest T +// +--------+---------+ +// | 9 bits | 42 bits | <== smallest T +// +------------+----------------+-+ +// | 13 bits | 50 bits | | +// +------------+----------------+-+ + + + +// Special Cases +//============================================================== + +// double float +// overflow error 24 30 + +// underflow error 25 31 + +// X zero Y zero +// +0 +0 +1 error 26 32 +// -0 +0 +1 error 26 32 +// +0 -0 +1 error 26 32 +// -0 -0 +1 error 26 32 + +// X zero Y negative +// +0 -odd integer +inf error 27 33 divide-by-zero +// -0 -odd integer -inf error 27 33 divide-by-zero +// +0 !-odd integer +inf error 27 33 divide-by-zero +// -0 !-odd integer +inf error 27 33 divide-by-zero +// +0 -inf +inf error 27 33 divide-by-zero +// -0 -inf +inf error 27 33 divide-by-zero + +// X zero Y positive +// +0 +odd integer +0 +// -0 +odd integer -0 +// +0 !+odd integer +0 +// -0 !+odd integer +0 +// +0 +inf +0 +// -0 +inf +0 +// +0 Y NaN quiet Y invalid if Y SNaN +// -0 Y NaN quiet Y invalid if Y SNaN + +// X one +// -1 Y inf +1 +// -1 Y NaN quiet Y invalid if Y SNaN +// +1 Y NaN +1 invalid if Y SNaN +// +1 Y any else +1 + +// X - Y not integer QNAN error 28 34 invalid + +// X NaN Y 0 +1 error 29 35 +// X NaN Y NaN quiet X invalid 
if X or Y SNaN +// X NaN Y any else quiet X invalid if X SNaN +// X !+1 Y NaN quiet Y invalid if Y SNaN + + +// X +inf Y >0 +inf +// X -inf Y >0, !odd integer +inf +// X -inf Y >0, odd integer -inf + +// X +inf Y <0 +0 +// X -inf Y <0, !odd integer +0 +// X -inf Y <0, odd integer -0 + +// X +inf Y =0 +1 +// X -inf Y =0 +1 + +// |X|<1 Y +inf +0 +// |X|<1 Y -inf +inf +// |X|>1 Y +inf +inf +// |X|>1 Y -inf +0 + +// X any Y =0 +1 + +#include "libm_support.h" + +// Assembly macros (symbolic names for the registers used; "duplicate" marks a register shared by two names) +//============================================================== + +// integer registers used + +pow_AD_Tt = r33 +pow_GR_FFF7 = r34 +pow_GR_exp_Y = r34 // duplicate +pow_GR_17ones = r35 + +pow_AD_P = r36 +pow_AD_Q = r37 +pow_AD_tbl1 = r38 +pow_AD_tbl2 = r39 +pow_GR_exp_X = r40 +pow_GR_true_exp_X = r40 // duplicate + +pow_GR_offset = r41 +pow_GR_exp_Xm1 = r42 +pow_GR_sig_X = r43 +pow_GR_signexp_X = r44 + +pow_GR_signexp_Xm1 = r46 +pow_GR_int_W1 = r47 +pow_GR_int_W2 = r48 +pow_GR_int_N = r49 +pow_GR_index1 = r50 + +pow_GR_index2 = r51 +pow_AD_T1 = r52 +pow_AD_T2 = r53 +pow_GR_gt_ln = r53 // duplicate +pow_int_GR_M = r54 +pow_GR_10033 = r55 + +pow_GR_16ones = r56 +pow_GR_sig_int_Y = r57 +pow_GR_sign_Y_Gpr = r58 +pow_GR_17ones_m1 = r59 +pow_GR_one = r60 +pow_GR_sign_Y = r60 // duplicate + +pow_GR_signexp_Y_Gpr = r61 +pow_GR_exp_Y_Gpr = r62 +pow_GR_true_exp_Y_Gpr = r63 +pow_GR_signexp_Y = r64 + +GR_SAVE_B0 = r65 +GR_SAVE_GP = r66 +GR_SAVE_PFS = r67 + +GR_Parameter_X = r68 +GR_Parameter_Y = r69 +GR_Parameter_RESULT = r70 +pow_GR_tag = r71 + + +// floating point registers used + +POW_B = f32 +POW_NORM_X = f33 +POW_Xm1 = f34 +POW_r1 = f34 // duplicate +POW_P4 = f35 + +POW_P5 = f36 +POW_NORM_Y = f37 +POW_Q2 = f38 +POW_Q3 = f39 +POW_P2 = f40 + +POW_P3 = f41 +POW_P0 = f42 +POW_log2_lo = f43 +POW_r = f44 +POW_Q0_half = f45 + +POW_Q1 = f46 +POW_log2_hi = f48 +POW_Q4 = f49 +POW_P1 = f50 + +POW_log2_by_128_hi = f51 +POW_inv_log2_by_128 = f52 +POW_rsq = f53 +POW_Yrcub = f54 +POW_log2_by_128_lo = f55 + +POW_v6 = f56 +POW_v4 = f58 
+POW_v2 = f59 +POW_T = f60 + +POW_Tt = f61 +POW_RSHF = f62 +POW_v21ps = f63 +POW_s4 = f64 + +POW_U = f66 +POW_G = f67 +POW_delta = f68 +POW_v3 = f69 +POW_V = f70 + +POW_p = f71 +POW_Z1 = f72 +POW_e3 = f73 +POW_e2 = f74 +POW_Z2 = f75 + +POW_e1 = f76 +POW_W1 = f77 +POW_UmZ2 = f78 +POW_W2 = f79 +POW_Z3 = f80 + +POW_int_W1 = f81 +POW_e12 = f82 +POW_int_W2 = f83 +POW_UmZ2pV = f84 +POW_Z3sq = f85 + +POW_e123 = f86 +POW_N1float = f87 +POW_N2float = f88 +POW_f3 = f89 +POW_q = f90 + +POW_s1 = f91 +POW_Nfloat = f92 +POW_s2 = f93 +POW_f2 = f94 +POW_f1 = f95 + +POW_T1 = f96 +POW_T2 = f97 +POW_2M = f98 +POW_s = f99 +POW_f12 = f100 + +POW_ssq = f101 +POW_T1T2 = f102 +POW_1ps = f103 +POW_A = f104 +POW_es = f105 + +POW_int_K = f107 +POW_K = f108 +POW_f123 = f109 +POW_Gpr = f110 + +POW_Y_Gpr = f111 +POW_int_Y = f112 + +POW_float_int_Y = f116 +POW_ftz_urm_f8 = f117 +POW_wre_urm_f8 = f118 +POW_abs_A = f119 +POW_gt_pln = f120 + +POW_xsq = f121 + +POW_twoV = f122 +POW_Xp1 = f123 + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +pow_table_P: +ASM_TYPE_DIRECTIVE(pow_table_P,@object) +data8 0x8000F7B249FF332D, 0x0000BFFC // P_5 +data8 0xAAAAAAA9E7902C7F, 0x0000BFFC // P_3 +data8 0x80000000000018E5, 0x0000BFFD // P_1 +data8 0xb8aa3b295c17f0bc, 0x00004006 // inv_ln2_by_128 + + +data8 0x3FA5555555554A9E // Q_2 +data8 0x3F8111124F4DD9F9 // Q_3 +data8 0x3FE0000000000000 // Q_0 +data8 0x3FC5555555554733 // Q_1 +data8 0x3F56C16D9360FFA0 // Q_4 +data8 0x43e8000000000000 // Right shift constant for exp +data8 0xc9e3b39803f2f6af, 0x00003fb7 // ln2_by_128_lo +data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q +data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q +ASM_SIZE_DIRECTIVE(pow_table_P) + +pow_table_Q: +ASM_TYPE_DIRECTIVE(pow_table_Q,@object) +data8 0x9249FE7F0DC423CF, 0x00003FFC // P_4 +data8 0xCCCCCCCC4ED2BA7F, 0x00003FFC // P_2 +data8 
0xAAAAAAAAAAAAB505, 0x00003FFD // P_0 +data8 0x3fe62e42fefa39e8, 0x3cccd5e4f1d9cc02 // log2 hi lo = +6.93147e-001 +data8 0xb17217f7d1cf79ab, 0x00003ff7 // ln2_by_128_hi +ASM_SIZE_DIRECTIVE(pow_table_Q) + + +pow_Tt: +ASM_TYPE_DIRECTIVE(pow_Tt,@object) +data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 // log(1/frcpa(1+0/256))= +1.95503e-003 +data8 0x3f78121214586a00, 0x3cb540e0a5cfc9bc // log(1/frcpa(1+1/256))= +5.87661e-003 +data8 0x3f841929f9683200, 0x3cbdf1d57404da1f // log(1/frcpa(1+2/256))= +9.81362e-003 +data8 0x3f8c317384c75f00, 0x3c69806208c04c22 // log(1/frcpa(1+3/256))= +1.37662e-002 +data8 0x3f91a6b91ac73380, 0x3c7874daa716eb32 // log(1/frcpa(1+4/256))= +1.72376e-002 +data8 0x3f95ba9a5d9ac000, 0x3cacbb84e08d78ac // log(1/frcpa(1+5/256))= +2.12196e-002 +data8 0x3f99d2a807432580, 0x3cbcf80538b441e1 // log(1/frcpa(1+6/256))= +2.52177e-002 +data8 0x3f9d6b2725979800, 0x3c6095e5c8f8f359 // log(1/frcpa(1+7/256))= +2.87291e-002 +data8 0x3fa0c58fa19dfa80, 0x3cb4c5d4e9d0dda2 // log(1/frcpa(1+8/256))= +3.27573e-002 +data8 0x3fa2954c78cbce00, 0x3caa932b860ab8d6 // log(1/frcpa(1+9/256))= +3.62953e-002 +data8 0x3fa4a94d2da96c40, 0x3ca670452b76bbd5 // log(1/frcpa(1+10/256))= +4.03542e-002 +data8 0x3fa67c94f2d4bb40, 0x3ca84104f9941798 // log(1/frcpa(1+11/256))= +4.39192e-002 +data8 0x3fa85188b630f040, 0x3cb40a882cbf0153 // log(1/frcpa(1+12/256))= +4.74971e-002 +data8 0x3faa6b8abe73af40, 0x3c988d46e25c9059 // log(1/frcpa(1+13/256))= +5.16017e-002 +data8 0x3fac441e06f72a80, 0x3cae3e930a1a2a96 // log(1/frcpa(1+14/256))= +5.52072e-002 +data8 0x3fae1e6713606d00, 0x3c8a796f6283b580 // log(1/frcpa(1+15/256))= +5.88257e-002 +data8 0x3faffa6911ab9300, 0x3c5193070351e88a // log(1/frcpa(1+16/256))= +6.24574e-002 +data8 0x3fb0ec139c5da600, 0x3c623f2a75eb992d // log(1/frcpa(1+17/256))= +6.61022e-002 +data8 0x3fb1dbd2643d1900, 0x3ca649b2ef8927f0 // log(1/frcpa(1+18/256))= +6.97605e-002 +data8 0x3fb2cc7284fe5f00, 0x3cbc5e86599513e2 // log(1/frcpa(1+19/256))= +7.34321e-002 +data8 
0x3fb3bdf5a7d1ee60, 0x3c90bd4bb69dada3 // log(1/frcpa(1+20/256))= +7.71173e-002 +data8 0x3fb4b05d7aa012e0, 0x3c54e377c9b8a54f // log(1/frcpa(1+21/256))= +8.08161e-002 +data8 0x3fb580db7ceb5700, 0x3c7fdb2f98354cde // log(1/frcpa(1+22/256))= +8.39975e-002 +data8 0x3fb674f089365a60, 0x3cb9994c9d3301c1 // log(1/frcpa(1+23/256))= +8.77219e-002 +data8 0x3fb769ef2c6b5680, 0x3caaec639db52a79 // log(1/frcpa(1+24/256))= +9.14602e-002 +data8 0x3fb85fd927506a40, 0x3c9f9f99a3cf8e25 // log(1/frcpa(1+25/256))= +9.52125e-002 +data8 0x3fb9335e5d594980, 0x3ca15c3abd47d99a // log(1/frcpa(1+26/256))= +9.84401e-002 +data8 0x3fba2b0220c8e5e0, 0x3cb4ca639adf6fc3 // log(1/frcpa(1+27/256))= +1.02219e-001 +data8 0x3fbb0004ac1a86a0, 0x3ca7cb81bf959a59 // log(1/frcpa(1+28/256))= +1.05469e-001 +data8 0x3fbbf968769fca00, 0x3cb0c646c121418e // log(1/frcpa(1+29/256))= +1.09274e-001 +data8 0x3fbccfedbfee13a0, 0x3ca0465fce24ab4b // log(1/frcpa(1+30/256))= +1.12548e-001 +data8 0x3fbda727638446a0, 0x3c82803f4e2e6603 // log(1/frcpa(1+31/256))= +1.15832e-001 +data8 0x3fbea3257fe10f60, 0x3cb986a3f2313d1a // log(1/frcpa(1+32/256))= +1.19677e-001 +data8 0x3fbf7be9fedbfde0, 0x3c97d16a6a621cf4 // log(1/frcpa(1+33/256))= +1.22985e-001 +data8 0x3fc02ab352ff25f0, 0x3c9cc6baad365600 // log(1/frcpa(1+34/256))= +1.26303e-001 +data8 0x3fc097ce579d2040, 0x3cb9ba16d329440b // log(1/frcpa(1+35/256))= +1.29633e-001 +data8 0x3fc1178e8227e470, 0x3cb7bc671683f8e6 // log(1/frcpa(1+36/256))= +1.33531e-001 +data8 0x3fc185747dbecf30, 0x3c9d1116f66d2345 // log(1/frcpa(1+37/256))= +1.36885e-001 +data8 0x3fc1f3b925f25d40, 0x3c8162c9ef939ac6 // log(1/frcpa(1+38/256))= +1.40250e-001 +data8 0x3fc2625d1e6ddf50, 0x3caad3a1ec384fc3 // log(1/frcpa(1+39/256))= +1.43627e-001 +data8 0x3fc2d1610c868130, 0x3cb3ad997036941b // log(1/frcpa(1+40/256))= +1.47015e-001 +data8 0x3fc340c597411420, 0x3cbc2308262c7998 // log(1/frcpa(1+41/256))= +1.50414e-001 +data8 0x3fc3b08b6757f2a0, 0x3cb2170d6cdf0526 // log(1/frcpa(1+42/256))= +1.53825e-001 
+data8 0x3fc40dfb08378000, 0x3c9bb453c4f7b685 // log(1/frcpa(1+43/256))= +1.56677e-001 +data8 0x3fc47e74e8ca5f70, 0x3cb836a48fdfce9d // log(1/frcpa(1+44/256))= +1.60109e-001 +data8 0x3fc4ef51f6466de0, 0x3ca07a43919aa64b // log(1/frcpa(1+45/256))= +1.63553e-001 +data8 0x3fc56092e02ba510, 0x3ca85006899d97b0 // log(1/frcpa(1+46/256))= +1.67010e-001 +data8 0x3fc5d23857cd74d0, 0x3ca30a5ba6e7abbe // log(1/frcpa(1+47/256))= +1.70478e-001 +data8 0x3fc6313a37335d70, 0x3ca905586f0ac97e // log(1/frcpa(1+48/256))= +1.73377e-001 +data8 0x3fc6a399dabbd380, 0x3c9b2c6657a96684 // log(1/frcpa(1+49/256))= +1.76868e-001 +data8 0x3fc70337dd3ce410, 0x3cb50bc52f55cdd8 // log(1/frcpa(1+50/256))= +1.79786e-001 +data8 0x3fc77654128f6120, 0x3cad2eb7c9a39efe // log(1/frcpa(1+51/256))= +1.83299e-001 +data8 0x3fc7e9d82a0b0220, 0x3cba127e90393c01 // log(1/frcpa(1+52/256))= +1.86824e-001 +data8 0x3fc84a6b759f5120, 0x3cbd7fd52079f706 // log(1/frcpa(1+53/256))= +1.89771e-001 +data8 0x3fc8ab47d5f5a300, 0x3cbfae141751a3de // log(1/frcpa(1+54/256))= +1.92727e-001 +data8 0x3fc91fe490965810, 0x3cb69cf30a1c319e // log(1/frcpa(1+55/256))= +1.96286e-001 +data8 0x3fc981634011aa70, 0x3ca5bb3d208bc42a // log(1/frcpa(1+56/256))= +1.99261e-001 +data8 0x3fc9f6c407089660, 0x3ca04d68658179a0 // log(1/frcpa(1+57/256))= +2.02843e-001 +data8 0x3fca58e729348f40, 0x3c99f5411546c286 // log(1/frcpa(1+58/256))= +2.05838e-001 +data8 0x3fcabb55c31693a0, 0x3cb9a5350eb327d5 // log(1/frcpa(1+59/256))= +2.08842e-001 +data8 0x3fcb1e104919efd0, 0x3c18965fcce7c406 // log(1/frcpa(1+60/256))= +2.11855e-001 +data8 0x3fcb94ee93e367c0, 0x3cb503716da45184 // log(1/frcpa(1+61/256))= +2.15483e-001 +data8 0x3fcbf851c0675550, 0x3cbdf1b3f7ab5378 // log(1/frcpa(1+62/256))= +2.18516e-001 +data8 0x3fcc5c0254bf23a0, 0x3ca7aab9ed0b1d7b // log(1/frcpa(1+63/256))= +2.21558e-001 +data8 0x3fccc000c9db3c50, 0x3c92a7a2a850072a // log(1/frcpa(1+64/256))= +2.24609e-001 +data8 0x3fcd244d99c85670, 0x3c9f6019120edf4c // log(1/frcpa(1+65/256))= 
+2.27670e-001 +data8 0x3fcd88e93fb2f450, 0x3c6affb96815e081 // log(1/frcpa(1+66/256))= +2.30741e-001 +data8 0x3fcdedd437eaef00, 0x3c72553595897976 // log(1/frcpa(1+67/256))= +2.33820e-001 +data8 0x3fce530effe71010, 0x3c90913b020fa182 // log(1/frcpa(1+68/256))= +2.36910e-001 +data8 0x3fceb89a1648b970, 0x3c837ba4045bfd25 // log(1/frcpa(1+69/256))= +2.40009e-001 +data8 0x3fcf1e75fadf9bd0, 0x3cbcea6d13e0498d // log(1/frcpa(1+70/256))= +2.43117e-001 +data8 0x3fcf84a32ead7c30, 0x3ca5e3a67b3c6d77 // log(1/frcpa(1+71/256))= +2.46235e-001 +data8 0x3fcfeb2233ea07c0, 0x3cba0c6f0049c5a6 // log(1/frcpa(1+72/256))= +2.49363e-001 +data8 0x3fd028f9c7035c18, 0x3cb0a30b06677ff6 // log(1/frcpa(1+73/256))= +2.52501e-001 +data8 0x3fd05c8be0d96358, 0x3ca0f1c77ccb5865 // log(1/frcpa(1+74/256))= +2.55649e-001 +data8 0x3fd085eb8f8ae790, 0x3cbd513f45fe7a97 // log(1/frcpa(1+75/256))= +2.58174e-001 +data8 0x3fd0b9c8e32d1910, 0x3c927449047ca006 // log(1/frcpa(1+76/256))= +2.61339e-001 +data8 0x3fd0edd060b78080, 0x3c89b52d8435f53e // log(1/frcpa(1+77/256))= +2.64515e-001 +data8 0x3fd122024cf00638, 0x3cbdd976fabda4bd // log(1/frcpa(1+78/256))= +2.67701e-001 +data8 0x3fd14be2927aecd0, 0x3cb02f90ad0bc471 // log(1/frcpa(1+79/256))= +2.70257e-001 +data8 0x3fd180618ef18ad8, 0x3cbd003792c71a98 // log(1/frcpa(1+80/256))= +2.73461e-001 +data8 0x3fd1b50bbe2fc638, 0x3ca9ae64c6403ead // log(1/frcpa(1+81/256))= +2.76675e-001 +data8 0x3fd1df4cc7cf2428, 0x3cb43f0455f7e395 // log(1/frcpa(1+82/256))= +2.79254e-001 +data8 0x3fd214456d0eb8d0, 0x3cb0fbd748d75d30 // log(1/frcpa(1+83/256))= +2.82487e-001 +data8 0x3fd23ec5991eba48, 0x3c906edd746b77e2 // log(1/frcpa(1+84/256))= +2.85081e-001 +data8 0x3fd2740d9f870af8, 0x3ca9802e6a00a670 // log(1/frcpa(1+85/256))= +2.88333e-001 +data8 0x3fd29ecdabcdfa00, 0x3cacecef70890cfa // log(1/frcpa(1+86/256))= +2.90943e-001 +data8 0x3fd2d46602adcce8, 0x3cb97911955f3521 // log(1/frcpa(1+87/256))= +2.94214e-001 +data8 0x3fd2ff66b04ea9d0, 0x3cb12dabe191d1c9 // 
log(1/frcpa(1+88/256))= +2.96838e-001 +data8 0x3fd335504b355a30, 0x3cbdf9139df924ec // log(1/frcpa(1+89/256))= +3.00129e-001 +data8 0x3fd360925ec44f58, 0x3cb253e68977a1e3 // log(1/frcpa(1+90/256))= +3.02769e-001 +data8 0x3fd38bf1c3337e70, 0x3cb3d283d2a2da21 // log(1/frcpa(1+91/256))= +3.05417e-001 +data8 0x3fd3c25277333180, 0x3cadaa5b035eae27 // log(1/frcpa(1+92/256))= +3.08735e-001 +data8 0x3fd3edf463c16838, 0x3cb983d680d3c108 // log(1/frcpa(1+93/256))= +3.11399e-001 +data8 0x3fd419b423d5e8c0, 0x3cbc86dd921c139d // log(1/frcpa(1+94/256))= +3.14069e-001 +data8 0x3fd44591e0539f48, 0x3c86a76d6dc2782e // log(1/frcpa(1+95/256))= +3.16746e-001 +data8 0x3fd47c9175b6f0a8, 0x3cb59a2e013c6b5f // log(1/frcpa(1+96/256))= +3.20103e-001 +data8 0x3fd4a8b341552b08, 0x3c93f1e86e468694 // log(1/frcpa(1+97/256))= +3.22797e-001 +data8 0x3fd4d4f390890198, 0x3cbf5e4ea7c5105a // log(1/frcpa(1+98/256))= +3.25498e-001 +data8 0x3fd501528da1f960, 0x3cbf58da53e9ad10 // log(1/frcpa(1+99/256))= +3.28206e-001 +data8 0x3fd52dd06347d4f0, 0x3cb98a28cebf6eef // log(1/frcpa(1+100/256))= +3.30921e-001 +data8 0x3fd55a6d3c7b8a88, 0x3c9c76b67c2d1fd4 // log(1/frcpa(1+101/256))= +3.33644e-001 +data8 0x3fd5925d2b112a58, 0x3c9029616a4331b8 // log(1/frcpa(1+102/256))= +3.37058e-001 +data8 0x3fd5bf406b543db0, 0x3c9fb8292ecfc820 // log(1/frcpa(1+103/256))= +3.39798e-001 +data8 0x3fd5ec433d5c35a8, 0x3cb71a1229d17eec // log(1/frcpa(1+104/256))= +3.42545e-001 +data8 0x3fd61965cdb02c18, 0x3cbba94fe1dbb8d2 // log(1/frcpa(1+105/256))= +3.45300e-001 +data8 0x3fd646a84935b2a0, 0x3c9ee496d2c9ae57 // log(1/frcpa(1+106/256))= +3.48063e-001 +data8 0x3fd6740add31de90, 0x3cb1da3a6c7a9dfd // log(1/frcpa(1+107/256))= +3.50833e-001 +data8 0x3fd6a18db74a58c0, 0x3cb494c257add8dc // log(1/frcpa(1+108/256))= +3.53610e-001 +data8 0x3fd6cf31058670e8, 0x3cb0b244a70a8da9 // log(1/frcpa(1+109/256))= +3.56396e-001 +data8 0x3fd6f180e852f0b8, 0x3c9db7aefa866720 // log(1/frcpa(1+110/256))= +3.58490e-001 +data8 0x3fd71f5d71b894e8, 
0x3cbe91c4bf324957 // log(1/frcpa(1+111/256))= +3.61289e-001 +data8 0x3fd74d5aefd66d58, 0x3cb06b3d9bfac023 // log(1/frcpa(1+112/256))= +3.64096e-001 +data8 0x3fd77b79922bd378, 0x3cb727d8804491f4 // log(1/frcpa(1+113/256))= +3.66911e-001 +data8 0x3fd7a9b9889f19e0, 0x3ca2ef22df5bc543 // log(1/frcpa(1+114/256))= +3.69734e-001 +data8 0x3fd7d81b037eb6a0, 0x3cb8fd3ba07a7ece // log(1/frcpa(1+115/256))= +3.72565e-001 +data8 0x3fd8069e33827230, 0x3c8bd1e25866e61a // log(1/frcpa(1+116/256))= +3.75404e-001 +data8 0x3fd82996d3ef8bc8, 0x3ca5aab9f5928928 // log(1/frcpa(1+117/256))= +3.77538e-001 +data8 0x3fd85855776dcbf8, 0x3ca56f33337789d6 // log(1/frcpa(1+118/256))= +3.80391e-001 +data8 0x3fd8873658327cc8, 0x3cbb8ef0401db49d // log(1/frcpa(1+119/256))= +3.83253e-001 +data8 0x3fd8aa75973ab8c8, 0x3cbb9961f509a680 // log(1/frcpa(1+120/256))= +3.85404e-001 +data8 0x3fd8d992dc8824e0, 0x3cb220512a53732d // log(1/frcpa(1+121/256))= +3.88280e-001 +data8 0x3fd908d2ea7d9510, 0x3c985f0e513bfb5c // log(1/frcpa(1+122/256))= +3.91164e-001 +data8 0x3fd92c59e79c0e50, 0x3cb82e073fd30d63 // log(1/frcpa(1+123/256))= +3.93332e-001 +data8 0x3fd95bd750ee3ed0, 0x3ca4aa7cdb6dd8a8 // log(1/frcpa(1+124/256))= +3.96231e-001 +data8 0x3fd98b7811a3ee58, 0x3caa93a5b660893e // log(1/frcpa(1+125/256))= +3.99138e-001 +data8 0x3fd9af47f33d4068, 0x3cac294b3b3190ba // log(1/frcpa(1+126/256))= +4.01323e-001 +data8 0x3fd9df270c1914a0, 0x3cbe1a58fd0cd67e // log(1/frcpa(1+127/256))= +4.04245e-001 +data8 0x3fda0325ed14fda0, 0x3cb1efa7950fb57e // log(1/frcpa(1+128/256))= +4.06442e-001 +data8 0x3fda33440224fa78, 0x3c8915fe75e7d477 // log(1/frcpa(1+129/256))= +4.09379e-001 +data8 0x3fda57725e80c380, 0x3ca72bd1062b1b7f // log(1/frcpa(1+130/256))= +4.11587e-001 +data8 0x3fda87d0165dd198, 0x3c91f7845f58dbad // log(1/frcpa(1+131/256))= +4.14539e-001 +data8 0x3fdaac2e6c03f890, 0x3cb6f237a911c509 // log(1/frcpa(1+132/256))= +4.16759e-001 +data8 0x3fdadccc6fdf6a80, 0x3c90ddc4b7687169 // log(1/frcpa(1+133/256))= +4.19726e-001 
+data8 0x3fdb015b3eb1e790, 0x3c692dd7d90e1e8e // log(1/frcpa(1+134/256))= +4.21958e-001 +data8 0x3fdb323a3a635948, 0x3c6f85655cbe14de // log(1/frcpa(1+135/256))= +4.24941e-001 +data8 0x3fdb56fa04462908, 0x3c95252d841994de // log(1/frcpa(1+136/256))= +4.27184e-001 +data8 0x3fdb881aa659bc90, 0x3caa53a745a3642f // log(1/frcpa(1+137/256))= +4.30182e-001 +data8 0x3fdbad0bef3db160, 0x3cb32f2540dcc16a // log(1/frcpa(1+138/256))= +4.32437e-001 +data8 0x3fdbd21297781c28, 0x3cbd8e891e106f1d // log(1/frcpa(1+139/256))= +4.34697e-001 +data8 0x3fdc039236f08818, 0x3c809435af522ba7 // log(1/frcpa(1+140/256))= +4.37718e-001 +data8 0x3fdc28cb1e4d32f8, 0x3cb3944752fbd81e // log(1/frcpa(1+141/256))= +4.39990e-001 +data8 0x3fdc4e19b84723c0, 0x3c9a465260cd3fe5 // log(1/frcpa(1+142/256))= +4.42267e-001 +data8 0x3fdc7ff9c74554c8, 0x3c92447d5b6ca369 // log(1/frcpa(1+143/256))= +4.45311e-001 +data8 0x3fdca57b64e9db00, 0x3cb44344a8a00c82 // log(1/frcpa(1+144/256))= +4.47600e-001 +data8 0x3fdccb130a5ceba8, 0x3cbefaddfb97b73f // log(1/frcpa(1+145/256))= +4.49895e-001 +data8 0x3fdcf0c0d18f3268, 0x3cbd3e7bfee57898 // log(1/frcpa(1+146/256))= +4.52194e-001 +data8 0x3fdd232075b5a200, 0x3c9222599987447c // log(1/frcpa(1+147/256))= +4.55269e-001 +data8 0x3fdd490246defa68, 0x3cabafe9a767a80d // log(1/frcpa(1+148/256))= +4.57581e-001 +data8 0x3fdd6efa918d25c8, 0x3cb58a2624e1c6fd // log(1/frcpa(1+149/256))= +4.59899e-001 +data8 0x3fdd9509707ae528, 0x3cbdc3babce578e7 // log(1/frcpa(1+150/256))= +4.62221e-001 +data8 0x3fddbb2efe92c550, 0x3cb0ac0943c434a4 // log(1/frcpa(1+151/256))= +4.64550e-001 +data8 0x3fddee2f3445e4a8, 0x3cbba9d07ce820e8 // log(1/frcpa(1+152/256))= +4.67663e-001 +data8 0x3fde148a1a2726c8, 0x3cb6537e3375b205 // log(1/frcpa(1+153/256))= +4.70004e-001 +data8 0x3fde3afc0a49ff38, 0x3cbfed5518dbc20e // log(1/frcpa(1+154/256))= +4.72350e-001 +data8 0x3fde6185206d5168, 0x3cb6572601f73d5c // log(1/frcpa(1+155/256))= +4.74702e-001 +data8 0x3fde882578823d50, 0x3c9b24abd4584d1a // 
log(1/frcpa(1+156/256))= +4.77060e-001 +data8 0x3fdeaedd2eac9908, 0x3cb0ceb5e4d2c8f7 // log(1/frcpa(1+157/256))= +4.79423e-001 +data8 0x3fded5ac5f436be0, 0x3ca72f21f1f5238e // log(1/frcpa(1+158/256))= +4.81792e-001 +data8 0x3fdefc9326d16ab8, 0x3c85081a1639a45c // log(1/frcpa(1+159/256))= +4.84166e-001 +data8 0x3fdf2391a21575f8, 0x3cbf11015bdd297a // log(1/frcpa(1+160/256))= +4.86546e-001 +data8 0x3fdf4aa7ee031928, 0x3cb3795bc052a2d1 // log(1/frcpa(1+161/256))= +4.88932e-001 +data8 0x3fdf71d627c30bb0, 0x3c35c61f0f5a88f3 // log(1/frcpa(1+162/256))= +4.91323e-001 +data8 0x3fdf991c6cb3b378, 0x3c97d99419be6028 // log(1/frcpa(1+163/256))= +4.93720e-001 +data8 0x3fdfc07ada69a908, 0x3cbfe9341ded70b1 // log(1/frcpa(1+164/256))= +4.96123e-001 +data8 0x3fdfe7f18eb03d38, 0x3cb85718a640c33f // log(1/frcpa(1+165/256))= +4.98532e-001 +data8 0x3fe007c053c5002c, 0x3cb3addc9c065f09 // log(1/frcpa(1+166/256))= +5.00946e-001 +data8 0x3fe01b942198a5a0, 0x3c9d5aa4c77da6ac // log(1/frcpa(1+167/256))= +5.03367e-001 +data8 0x3fe02f74400c64e8, 0x3cb5a0ee4450ef52 // log(1/frcpa(1+168/256))= +5.05793e-001 +data8 0x3fe04360be7603ac, 0x3c9dd00c35630fe0 // log(1/frcpa(1+169/256))= +5.08225e-001 +data8 0x3fe05759ac47fe30, 0x3cbd063e1f0bd82c // log(1/frcpa(1+170/256))= +5.10663e-001 +data8 0x3fe06b5f1911cf50, 0x3cae8da674af5289 // log(1/frcpa(1+171/256))= +5.13107e-001 +data8 0x3fe078bf0533c568, 0x3c62241edf5fd1f7 // log(1/frcpa(1+172/256))= +5.14740e-001 +data8 0x3fe08cd9687e7b0c, 0x3cb3007febcca227 // log(1/frcpa(1+173/256))= +5.17194e-001 +data8 0x3fe0a10074cf9018, 0x3ca496e84603816b // log(1/frcpa(1+174/256))= +5.19654e-001 +data8 0x3fe0b5343a234474, 0x3cb46098d14fc90a // log(1/frcpa(1+175/256))= +5.22120e-001 +data8 0x3fe0c974c89431cc, 0x3cac0a7cdcbb86c6 // log(1/frcpa(1+176/256))= +5.24592e-001 +data8 0x3fe0ddc2305b9884, 0x3cb2f753210410ff // log(1/frcpa(1+177/256))= +5.27070e-001 +data8 0x3fe0eb524bafc918, 0x3c88affd6682229e // log(1/frcpa(1+178/256))= +5.28726e-001 +data8 
0x3fe0ffb54213a474, 0x3cadeefbab9af993 // log(1/frcpa(1+179/256))= +5.31214e-001 +data8 0x3fe114253da97d9c, 0x3cbaf1c2b8bc160a // log(1/frcpa(1+180/256))= +5.33709e-001 +data8 0x3fe128a24f1d9afc, 0x3cb9cf4df375e650 // log(1/frcpa(1+181/256))= +5.36210e-001 +data8 0x3fe1365252bf0864, 0x3c985a621d4be111 // log(1/frcpa(1+182/256))= +5.37881e-001 +data8 0x3fe14ae558b4a92c, 0x3ca104c4aa8977d1 // log(1/frcpa(1+183/256))= +5.40393e-001 +data8 0x3fe15f85a19c7658, 0x3cbadf26e540f375 // log(1/frcpa(1+184/256))= +5.42910e-001 +data8 0x3fe16d4d38c119f8, 0x3cb3aea11caec416 // log(1/frcpa(1+185/256))= +5.44592e-001 +data8 0x3fe18203c20dd130, 0x3cba82d1211d1d6d // log(1/frcpa(1+186/256))= +5.47121e-001 +data8 0x3fe196c7bc4b1f38, 0x3cb6267acc4f4f4a // log(1/frcpa(1+187/256))= +5.49656e-001 +data8 0x3fe1a4a738b7a33c, 0x3c858930213c987d // log(1/frcpa(1+188/256))= +5.51349e-001 +data8 0x3fe1b981c0c9653c, 0x3c9bc2a4a30f697b // log(1/frcpa(1+189/256))= +5.53895e-001 +data8 0x3fe1ce69e8bb1068, 0x3cb7ae6199cf2a00 // log(1/frcpa(1+190/256))= +5.56447e-001 +data8 0x3fe1dc619de06944, 0x3c6b50bb38388177 // log(1/frcpa(1+191/256))= +5.58152e-001 +data8 0x3fe1f160a2ad0da0, 0x3cbd05b2778a5e1d // log(1/frcpa(1+192/256))= +5.60715e-001 +data8 0x3fe2066d7740737c, 0x3cb32e828f9c6bd6 // log(1/frcpa(1+193/256))= +5.63285e-001 +data8 0x3fe2147dba47a390, 0x3cbd579851b8b672 // log(1/frcpa(1+194/256))= +5.65001e-001 +data8 0x3fe229a1bc5ebac0, 0x3cbb321be5237ce8 // log(1/frcpa(1+195/256))= +5.67582e-001 +data8 0x3fe237c1841a502c, 0x3cb3b56e0915ea64 // log(1/frcpa(1+196/256))= +5.69306e-001 +data8 0x3fe24cfce6f80d98, 0x3cb34a4d1a422919 // log(1/frcpa(1+197/256))= +5.71898e-001 +data8 0x3fe25b2c55cd5760, 0x3cb237401ea5015e // log(1/frcpa(1+198/256))= +5.73630e-001 +data8 0x3fe2707f4d5f7c40, 0x3c9d30f20acc8341 // log(1/frcpa(1+199/256))= +5.76233e-001 +data8 0x3fe285e0842ca380, 0x3cbc4d866d5f21c0 // log(1/frcpa(1+200/256))= +5.78842e-001 +data8 0x3fe294294708b770, 0x3cb85e14d5dc54fa // 
log(1/frcpa(1+201/256))= +5.80586e-001 +data8 0x3fe2a9a2670aff0c, 0x3c7e6f8f468bbf91 // log(1/frcpa(1+202/256))= +5.83207e-001 +data8 0x3fe2b7fb2c8d1cc0, 0x3c930ffcf63c8b65 // log(1/frcpa(1+203/256))= +5.84959e-001 +data8 0x3fe2c65a6395f5f4, 0x3ca0afe20b53d2d2 // log(1/frcpa(1+204/256))= +5.86713e-001 +data8 0x3fe2dbf557b0df40, 0x3cb646be1188fbc9 // log(1/frcpa(1+205/256))= +5.89350e-001 +data8 0x3fe2ea64c3f97654, 0x3c96516fa8df33b2 // log(1/frcpa(1+206/256))= +5.91113e-001 +data8 0x3fe3001823684d70, 0x3cb96d64e16d1360 // log(1/frcpa(1+207/256))= +5.93762e-001 +data8 0x3fe30e97e9a8b5cc, 0x3c98ef96bc97cca0 // log(1/frcpa(1+208/256))= +5.95531e-001 +data8 0x3fe32463ebdd34e8, 0x3caef1dc9a56c1bf // log(1/frcpa(1+209/256))= +5.98192e-001 +data8 0x3fe332f4314ad794, 0x3caa4f0ac5d5fa11 // log(1/frcpa(1+210/256))= +5.99970e-001 +data8 0x3fe348d90e7464cc, 0x3cbe7889f0516acd // log(1/frcpa(1+211/256))= +6.02643e-001 +data8 0x3fe35779f8c43d6c, 0x3ca96bbab7245411 // log(1/frcpa(1+212/256))= +6.04428e-001 +data8 0x3fe36621961a6a98, 0x3ca31f32262db9fb // log(1/frcpa(1+213/256))= +6.06217e-001 +data8 0x3fe37c299f3c3668, 0x3cb15c72c107ee29 // log(1/frcpa(1+214/256))= +6.08907e-001 +data8 0x3fe38ae2171976e4, 0x3cba42a2554b2dd4 // log(1/frcpa(1+215/256))= +6.10704e-001 +data8 0x3fe399a157a603e4, 0x3cb99c62286d8919 // log(1/frcpa(1+216/256))= +6.12504e-001 +data8 0x3fe3afccfe77b9d0, 0x3ca11048f96a43bd // log(1/frcpa(1+217/256))= +6.15210e-001 +data8 0x3fe3be9d503533b4, 0x3ca4022f47588c3e // log(1/frcpa(1+218/256))= +6.17018e-001 +data8 0x3fe3cd7480b4a8a0, 0x3cb4ba7afc2dc56a // log(1/frcpa(1+219/256))= +6.18830e-001 +data8 0x3fe3e3c43918f76c, 0x3c859673d064b8ba // log(1/frcpa(1+220/256))= +6.21554e-001 +data8 0x3fe3f2acb27ed6c4, 0x3cb55c6b452a16a8 // log(1/frcpa(1+221/256))= +6.23373e-001 +data8 0x3fe4019c2125ca90, 0x3cb8c367879c5a31 // log(1/frcpa(1+222/256))= +6.25197e-001 +data8 0x3fe4181061389720, 0x3cb2c17a79c5cc6c // log(1/frcpa(1+223/256))= +6.27937e-001 +data8 
0x3fe42711518df544, 0x3ca5f38d47012fc5 // log(1/frcpa(1+224/256))= +6.29769e-001 +data8 0x3fe436194e12b6bc, 0x3cb9854d65a9b426 // log(1/frcpa(1+225/256))= +6.31604e-001 +data8 0x3fe445285d68ea68, 0x3ca3ff9b3a81cd81 // log(1/frcpa(1+226/256))= +6.33442e-001 +data8 0x3fe45bcc464c8938, 0x3cb0a2d8011a6c05 // log(1/frcpa(1+227/256))= +6.36206e-001 +data8 0x3fe46aed21f117fc, 0x3c8a2be41f8e9f3d // log(1/frcpa(1+228/256))= +6.38053e-001 +data8 0x3fe47a1527e8a2d0, 0x3cba4a83594fab09 // log(1/frcpa(1+229/256))= +6.39903e-001 +data8 0x3fe489445efffcc8, 0x3cbf306a23dcbcde // log(1/frcpa(1+230/256))= +6.41756e-001 +data8 0x3fe4a018bcb69834, 0x3ca46c9285029fd1 // log(1/frcpa(1+231/256))= +6.44543e-001 +data8 0x3fe4af5a0c9d65d4, 0x3cbbc1db897580e3 // log(1/frcpa(1+232/256))= +6.46405e-001 +data8 0x3fe4bea2a5bdbe84, 0x3cb84d880d7ef775 // log(1/frcpa(1+233/256))= +6.48271e-001 +data8 0x3fe4cdf28f10ac44, 0x3cb3ec4b7893ce1f // log(1/frcpa(1+234/256))= +6.50140e-001 +data8 0x3fe4dd49cf994058, 0x3c897224d59d3408 // log(1/frcpa(1+235/256))= +6.52013e-001 +data8 0x3fe4eca86e64a680, 0x3cbccf620f24f0cd // log(1/frcpa(1+236/256))= +6.53889e-001 +data8 0x3fe503c43cd8eb68, 0x3c3f872c65971084 // log(1/frcpa(1+237/256))= +6.56710e-001 +data8 0x3fe513356667fc54, 0x3cb9ca64cc3d52c8 // log(1/frcpa(1+238/256))= +6.58595e-001 +data8 0x3fe522ae0738a3d4, 0x3cbe708164c75968 // log(1/frcpa(1+239/256))= +6.60483e-001 +data8 0x3fe5322e26867854, 0x3cb9988ba4aea615 // log(1/frcpa(1+240/256))= +6.62376e-001 +data8 0x3fe541b5cb979808, 0x3ca1662e3a6b95f5 // log(1/frcpa(1+241/256))= +6.64271e-001 +data8 0x3fe55144fdbcbd60, 0x3cb3acd4ca45c1e0 // log(1/frcpa(1+242/256))= +6.66171e-001 +data8 0x3fe560dbc45153c4, 0x3cb4988947959fed // log(1/frcpa(1+243/256))= +6.68074e-001 +data8 0x3fe5707a26bb8c64, 0x3cb3017fe6607ba9 // log(1/frcpa(1+244/256))= +6.69980e-001 +data8 0x3fe587f60ed5b8fc, 0x3cbe7a3266366ed4 // log(1/frcpa(1+245/256))= +6.72847e-001 +data8 0x3fe597a7977c8f30, 0x3ca1e12b9959a90e // 
log(1/frcpa(1+246/256))= +6.74763e-001 +data8 0x3fe5a760d634bb88, 0x3cb7c365e53d9602 // log(1/frcpa(1+247/256))= +6.76682e-001 +data8 0x3fe5b721d295f10c, 0x3cb716c2551ccbf0 // log(1/frcpa(1+248/256))= +6.78605e-001 +data8 0x3fe5c6ea94431ef8, 0x3ca02b2ed0e28261 // log(1/frcpa(1+249/256))= +6.80532e-001 +data8 0x3fe5d6bb22ea86f4, 0x3caf43a8bbb2f974 // log(1/frcpa(1+250/256))= +6.82462e-001 +data8 0x3fe5e6938645d38c, 0x3cbcedc98821b333 // log(1/frcpa(1+251/256))= +6.84397e-001 +data8 0x3fe5f673c61a2ed0, 0x3caa385eef5f2789 // log(1/frcpa(1+252/256))= +6.86335e-001 +data8 0x3fe6065bea385924, 0x3cb11624f165c5b4 // log(1/frcpa(1+253/256))= +6.88276e-001 +data8 0x3fe6164bfa7cc068, 0x3cbad884f87073fa // log(1/frcpa(1+254/256))= +6.90222e-001 +data8 0x3fe62643fecf9740, 0x3cb78c51da12f4df // log(1/frcpa(1+255/256))= +6.92171e-001 +ASM_SIZE_DIRECTIVE(pow_Tt) + + +// Table 1 is 2^(index_1/128) where +// index_1 goes from 0 to 15 +pow_tbl1: +ASM_TYPE_DIRECTIVE(pow_tbl1,@object) +data8 0x8000000000000000 , 0x00003FFF +data8 0x80B1ED4FD999AB6C , 0x00003FFF +data8 0x8164D1F3BC030773 , 0x00003FFF +data8 0x8218AF4373FC25EC , 0x00003FFF +data8 0x82CD8698AC2BA1D7 , 0x00003FFF +data8 0x8383594EEFB6EE37 , 0x00003FFF +data8 0x843A28C3ACDE4046 , 0x00003FFF +data8 0x84F1F656379C1A29 , 0x00003FFF +data8 0x85AAC367CC487B15 , 0x00003FFF +data8 0x8664915B923FBA04 , 0x00003FFF +data8 0x871F61969E8D1010 , 0x00003FFF +data8 0x87DB357FF698D792 , 0x00003FFF +data8 0x88980E8092DA8527 , 0x00003FFF +data8 0x8955EE03618E5FDD , 0x00003FFF +data8 0x8A14D575496EFD9A , 0x00003FFF +data8 0x8AD4C6452C728924 , 0x00003FFF +ASM_SIZE_DIRECTIVE(pow_tbl1) + + +// Table 2 is 2^(index_2/8) where +// index_2 goes from 0 to 7 +pow_tbl2: +ASM_TYPE_DIRECTIVE(pow_tbl2,@object) +data8 0x8000000000000000 , 0x00003FFF +data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF +data8 0x9837F0518DB8A96F , 0x00003FFF +data8 0xA5FED6A9B15138EA , 0x00003FFF +data8 0xB504F333F9DE6484 , 0x00003FFF +data8 0xC5672A115506DADD , 0x00003FFF +data8
0xD744FCCAD69D6AF4 , 0x00003FFF +data8 0xEAC0C6E7DD24392F , 0x00003FFF +ASM_SIZE_DIRECTIVE(pow_tbl2) + +.global powf + +.section .text +.proc powf +.align 32 + +powf: + +{ .mfi + alloc r32=ar.pfs,1,35,4,0 + fms.s1 POW_Xm1 = f8,f1,f1 // Will be used for r1 if x>0 + mov pow_GR_17ones = 0x1FFFF +} +{ .mfi +(p0) addl pow_AD_P = @ltoff(pow_table_P), gp + fma.s1 POW_Xp1 = f8,f1,f1 // Will be used for r1 if x<0 + nop.i 999 +;; +} + + +// Get exponent of x. Will be used to calculate K. +{ .mfi + getf.exp pow_GR_signexp_X = f8 + frcpa.s1 POW_B, p6 = f1,f8 + nop.i 999 +} +{ .mfi + ld8 pow_AD_P = [pow_AD_P] + fma.s1 POW_NORM_X = f8,f1,f0 + mov pow_GR_FFF7 = 0xFFF7 +} +;; + + + +// Get significand of x. Will be used to get index to fetch T, Tt. +// p13 = TRUE ==> X is unorm +// DOUBLE 0x10033 exponent limit at which y is an integer +// SINGLE 0x10016 +{ .mfi + getf.sig pow_GR_sig_X = f8 + fclass.m p13,p0 = f8, 0x0b // Test for x unorm + addl pow_GR_10033 = 0x10033, r0 +} +{ .mfi + mov pow_GR_16ones = 0xFFFF + fma.s1 POW_NORM_Y = f9,f1,f0 + nop.i 999 +} +;; + + +// p14 = TRUE ==> X is ZERO +{ .mfi + adds pow_AD_Tt = pow_Tt - pow_table_P, pow_AD_P + fclass.m p14,p15 = f8, 0x07 + and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones +} +{ .mfi + adds pow_AD_Q = pow_table_Q - pow_table_P, pow_AD_P + nop.f 999 + nop.i 999 +} +;; + +{ .mfi + ldfe POW_P5 = [pow_AD_P], 16 + fcmp.lt.s1 p8,p9 = f8, f0 // Test for x<0 + shl pow_GR_offset = pow_GR_sig_X, 1 +} +{ .mib + ldfe POW_P4 = [pow_AD_Q], 16 + sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones +(p13) br.cond.spnt L(POW_X_DENORM) +} +;; + + +// Continue normal and denormal paths here +L(POW_COMMON): +// p11 = TRUE ==> Y is a NAN +{ .mfi + ldfe POW_P3 = [pow_AD_P], 16 + fclass.m.unc p11,p0 = f9, 0xc3 + shr.u pow_GR_offset = pow_GR_offset,56 +} +{ .mfi + ldfe POW_P2 = [pow_AD_Q], 16 + nop.f 999 + nop.i 999 +} +;; + + + +// Compute xsq to decide later if |x|=1 +// p11 = TRUE ==> Y is a NaN +{ .mfi + setf.sig POW_int_K = pow_GR_true_exp_X 
+(p15) fms.s1 POW_r = POW_B, POW_NORM_X,f1 + shladd pow_AD_Tt = pow_GR_offset, 4, pow_AD_Tt +} +{ .mfi + nop.m 999 +(p8) fnma.s1 POW_Xm1 = POW_Xp1,f1,f0 + nop.i 999 +} +;; + + + +// p12 = TRUE ==> X is ZERO and Y is ZERO +{ .mfi + ldfe POW_P1 = [pow_AD_P], 16 +(p14) fclass.m.unc p12,p0 = f9, 0x07 + nop.i 999 +} +{ .mfb + ldfe POW_P0 = [pow_AD_Q], 16 + fma.s1 POW_xsq = POW_NORM_X, POW_NORM_X, f0 +(p11) br.cond.spnt L(POW_Y_NAN) +} +;; + + +.pred.rel "mutex",p8,p9 +// Get exponent of |x|-1 to use in comparison to 2^-8 +{ .mmf +(p8) getf.exp pow_GR_signexp_Xm1 = POW_Xp1 +(p9) getf.exp pow_GR_signexp_Xm1 = POW_Xm1 + fcvt.fx.s1 POW_int_Y = POW_NORM_Y +} +;; + + +// p11 = TRUE ==> X is a NAN +{ .mfi + ldfpd POW_log2_hi, POW_log2_lo = [pow_AD_Q], 16 + fclass.m.unc p11,p0 = f8, 0xc3 + nop.i 999 +} +{ .mib + ldfpd POW_T, POW_Tt = [pow_AD_Tt], 16 + nop.i 999 +(p12) br.cond.spnt L(POW_X_0_Y_0) +} +;; + + +// p14 = TRUE ==> X is zero +// p15 = TRUE ==> X is zero AND Y is negative +// p10 = TRUE ==> X is zero AND Y is >= zero +{ .mfi + ldfe POW_inv_log2_by_128 = [pow_AD_P], 16 +(p14) fcmp.lt.unc.s1 p15, p10 = f9,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + nop.f 999 + and pow_GR_exp_Xm1 = pow_GR_signexp_Xm1, pow_GR_17ones +} +;; + + +// Determine if we will use the |x| near 1 path (p6) or normal path (p7) +// p12 = TRUE ==> X is a NAN and Y is a zero +// p13 = TRUE ==> X is a NAN and Y is anything else +{ .mfi + getf.exp pow_GR_signexp_Y = POW_NORM_Y +(p11) fclass.m.unc p12,p13 = f9, 0x07 + cmp.lt.unc p6,p7 = pow_GR_exp_Xm1, pow_GR_FFF7 +} +{ .mfi + ldfpd POW_Q2, POW_Q3 = [pow_AD_P], 16 + fma.s1 POW_rsq = POW_r, POW_r,f0 + nop.i 999 +;; +} + +// If on the x near 1 path, assign r1 to r and r1*r1 to rsq +{ .mfi + ldfpd POW_Q0_half, POW_Q1 = [pow_AD_P], 16 +(p6) fma.s1 POW_r = POW_r1, f1, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p6) fma.s1 POW_rsq = POW_r1, POW_r1, f0 + nop.i 999 +;; +} + + +{ .mfi + ldfpd POW_Q4, POW_RSHF = [pow_AD_P], 16 +(p7) fma.s1 POW_v6 = POW_r, POW_P5, POW_P4 + 
and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones +} +{ .mfb + nop.m 999 +(p6) fma.s1 POW_v6 = POW_r1, POW_P5, POW_P4 +(p12) br.cond.spnt L(POW_X_NAN_Y_0) +} +;; + + +{ .mfi + nop.m 999 +(p7) fma.s1 POW_v4 = POW_P3, POW_r, POW_P2 + andcm pow_GR_sign_Y = pow_GR_signexp_Y, pow_GR_17ones +} +{ .mfb + nop.m 999 +(p6) fma.s1 POW_v4 = POW_P3, POW_r1, POW_P2 +(p12) br.cond.spnt L(POW_X_NAN_Y_0) +} +;; + +{ .mfi + nop.m 999 + fcvt.xf POW_K = POW_int_K + nop.i 999 +} +{ .mfb + nop.m 999 +(p13) fma.s f8 = f8,f1,f0 +(p13) br.ret.spnt b0 // Exit if x nan, y anything but zero +} +;; + +// p10 = TRUE ==> X is zero AND Y is positive +// p8 = TRUE ==> X is zero AND Y is outside integer range (treat as even int) +// return +0 +// p9 = TRUE ==> X is zero AND Y is within integer range (may not be integer) +{ .mfi +(p10) cmp.gt.unc p8,p9 = pow_GR_exp_Y, pow_GR_10033 +(p6) fmerge.s POW_delta = f0,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p6) fma.s1 POW_G = f0,f0,f0 + nop.i 999 +} +;; + +{ .mfi + getf.sig pow_GR_sig_int_Y = POW_int_Y + fnma.s1 POW_twoV = POW_NORM_Y, POW_rsq,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_U = POW_NORM_Y,POW_r,f0 + nop.i 999 +} +;; + +{ .mfi + ldfe POW_log2_by_128_lo = [pow_AD_P], 16 +(p6) fma.s1 POW_v2 = POW_P1, POW_r1, POW_P0 + nop.i 999 +} +{ .mfi + ldfe POW_log2_by_128_hi = [pow_AD_Q], 16 +(p7) fma.s1 POW_v2 = POW_P1, POW_r, POW_P0 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fcvt.xf POW_float_int_Y = POW_int_Y + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_v3 = POW_v6, POW_rsq, POW_v4 + adds pow_AD_tbl1 = pow_tbl1 - pow_Tt, pow_AD_Q +} +;; + +{ .mfi + nop.m 999 +(p7) fma.s1 POW_delta = POW_K, POW_log2_lo, POW_Tt + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 POW_G = POW_K, POW_log2_hi, POW_T + adds pow_AD_tbl2 = pow_tbl2 - pow_tbl1, pow_AD_tbl1 +} +;; + + +{ .mfi + nop.m 999 + fms.s1 POW_e2 = POW_NORM_Y, POW_r, POW_U + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_Z2 = POW_twoV, POW_Q0_half, POW_U + nop.i 999 +} +;; + +// p11 = TRUE ==> X is 
NEGATIVE +// p8 = TRUE ==> X is zero AND Y is outside intger range (treat as even int) +// return +0 +{ .mfi + nop.m 999 + fclass.m.unc p11,p0 = f8, 0x1a + nop.i 999 +} +{ .mfb + nop.m 999 +(p8) fma.s f8 = f0,f0,f0 +(p8) br.ret.spnt b0 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_Yrcub = POW_rsq, POW_U, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_p = POW_rsq, POW_v3, POW_v2 + nop.i 999 +} +;; + + +// p11 = TRUE ==> X is NEGATIVE +// p12 = TRUE ==> X is NEGATIVE AND Y already int +// p13 = TRUE ==> X is NEGATIVE AND Y possible int +{ .mfi + nop.m 999 + fma.s1 POW_Z1 = POW_NORM_Y, POW_G, f0 +(p11) cmp.ge.unc p12,p13 = pow_GR_exp_Y, pow_GR_10033 +} +{ .mfi + nop.m 999 + fma.s1 POW_e3 = POW_NORM_Y, POW_delta, f0 + nop.i 999 +} +;; + +// p9 = TRUE ==> X is zero AND Y is within integer range (may not be integer) +// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd) +// p7 = TRUE ==> X is zero AND Y is NOT an integer, return +0 +{ .mfi + nop.m 999 +(p9) fcmp.eq.unc.s1 p6,p7 = POW_float_int_Y, POW_NORM_Y + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_Gpr = POW_G, f1, POW_r + nop.i 999 +} +;; + +// By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand +{ .mfi + nop.m 999 + fma.s1 POW_W2 = POW_Z2, POW_inv_log2_by_128, POW_RSHF + nop.i 999 +} +{ .mfi + nop.m 999 + fms.s1 POW_UmZ2 = POW_U, f1, POW_Z2 + nop.i 999 +} +;; + + +// If x=0 and y>0, test y and flag denormal +// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd) +// p8 = TRUE ==> X is zero AND Y is an odd integer +// p9 = TRUE ==> X is zero AND Y is an even integer +{ .mfi + nop.m 999 +(p10) fcmp.eq.s0 p15,p0 = f9,f0 +(p6) tbit.nz.unc p8,p9 = pow_GR_sig_int_Y,0 +} +{ .mfi + nop.m 999 + fma.s1 POW_Z3 = POW_p, POW_Yrcub, f0 + nop.i 999 +} +;; + +// By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand +{ .mfi + nop.m 999 + fms.s1 POW_e1 = POW_NORM_Y, POW_G, POW_Z1 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_W1 = POW_Z1, 
POW_inv_log2_by_128, POW_RSHF + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p7) fma.s f8 = f0,f0,f0 // Result +0 if x zero and y not integer + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 POW_Y_Gpr = POW_NORM_Y, POW_Gpr, f0 +(p8) br.ret.spnt b0 // Exit if x zero and y odd integer +} +;; + +// By subtracting RSHF we get rounded integer POW_N2float +// p15 = TRUE ==> X_0_Y_NEG +{ .mfi + nop.m 999 + fms.s1 POW_N2float = POW_W2, f1, POW_RSHF + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 POW_UmZ2pV = POW_twoV,POW_Q0_half,POW_UmZ2 +(p15) br.cond.spnt L(POW_X_0_Y_NEG) +} +;; + + + +{ .mfi + nop.m 999 + fma.s1 POW_Z3sq = POW_Z3, POW_Z3, f0 + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s1 POW_v4 = POW_Z3, POW_Q3, POW_Q2 +(p7) br.ret.spnt b0 // Exit if x zero and y not an integer +} +;; + + + +// Extract rounded integer from rightmost significand of POW_W2 +// By subtracting RSHF we get rounded integer POW_N1float +{ .mfi + getf.sig pow_GR_int_W2 = POW_W2 + fms.s1 POW_N1float = POW_W1, f1, POW_RSHF + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_v2 = POW_Z3, POW_Q1, POW_Q0_half + nop.i 999 +} +;; + + + + +// p13 = TRUE ==> X is NEGATIVE AND Y possible int +// p10 = TRUE ==> X is NEG and Y is an int +// p12 = TRUE ==> X is NEG and Y is not an int +{ .mfi + nop.m 999 +(p13) fcmp.eq.unc.s1 p10,p12 = POW_float_int_Y, POW_NORM_Y + nop.i 999 +} +{ .mfb + nop.m 999 +(p9) fma.s f8 = f0,f0,f0 // Result +0 if x zero and y even integer +(p9) br.ret.spnt b0 // Exit if x zero and y even integer +} +;; + + +{ .mfi + nop.m 999 + fnma.s1 POW_s2 = POW_N2float, POW_log2_by_128_hi, POW_Z2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_e2 = POW_e2,f1,POW_UmZ2pV + nop.i 999 +} +;; + +// Extract rounded integer from rightmost significand of POW_W1 +// Test if x inf +{ .mfi + getf.sig pow_GR_int_W1 = POW_W1 + fclass.m.unc p15,p0 = POW_NORM_X, 0x23 + nop.i 999 +} +{ .mfb + nop.m 999 + fnma.s1 POW_f2 = POW_N2float, POW_log2_by_128_lo, f1 +(p12) br.cond.spnt L(POW_X_NEG_Y_NONINT) // Branch if x neg, y not 
integer +} +;; + +// p12 = TRUE ==> X is NEGATIVE AND Y is an odd integer +{ .mfi + getf.exp pow_GR_signexp_Y_Gpr = POW_Y_Gpr + fma.s1 POW_v3 = POW_Z3sq, POW_Q4, POW_v4 +(p10) tbit.nz.unc p12,p0 = pow_GR_sig_int_Y,0 +} +;; + + +{ .mfi + add pow_GR_int_N = pow_GR_int_W1, pow_GR_int_W2 + fnma.s1 POW_f1 = POW_N1float, POW_log2_by_128_lo, f1 + nop.i 999 +} +{ .mfb + nop.m 999 + fnma.s1 POW_s1 = POW_N1float, POW_log2_by_128_hi, POW_Z1 +(p15) br.cond.spnt L(POW_X_INF) +} +;; + + +// Test x and y and flag denormal +{ .mfi + and pow_GR_index1 = 0x0f, pow_GR_int_N + fcmp.eq.s0 p15,p0 = f8,f9 + shr r2 = pow_GR_int_N, 7 +} +{ .mfi + and pow_GR_exp_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones + nop.f 999 + and pow_GR_index2 = 0x70, pow_GR_int_N +} +;; + + + +{ .mfi + shladd pow_AD_T1 = pow_GR_index1, 4, pow_AD_tbl1 + fcmp.eq.s1 p7,p0 = POW_NORM_Y, f1 // Test for y=1.0 + sub pow_GR_true_exp_Y_Gpr = pow_GR_exp_Y_Gpr, pow_GR_16ones +} +{ .mfi + addl pow_int_GR_M = 0xFFFF, r2 + fma.s1 POW_e12 = POW_e1,f1,POW_e2 + add pow_AD_T2 = pow_AD_tbl2, pow_GR_index2 +} +;; + + +{ .mmi + ldfe POW_T1 = [pow_AD_T1],16 + setf.exp POW_2M = pow_int_GR_M + andcm pow_GR_sign_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones +} +;; + + +{ .mfb + ldfe POW_T2 = [pow_AD_T2],16 + fma.s1 POW_q = POW_Z3sq, POW_v3, POW_v2 +(p7) br.ret.spnt b0 // Early exit if y=1.0, result is x +} +;; + + +// double: p8 TRUE ==> |Y(G + r)| >= 10 +// single: p8 TRUE ==> |Y(G + r)| >= 7 + +// double +// -2^10 -2^9 2^9 2^10 +// -----+-----+----+ ... +-----+-----+----- +// p8 | p9 | p8 +// | | p10 | | +// single +// -2^7 -2^6 2^6 2^7 +// -----+-----+----+ ... 
+-----+-----+----- +// p8 | p9 | p8 +// | | p10 | | + + +{ .mfi +(p0) cmp.le.unc p8,p9 = 7, pow_GR_true_exp_Y_Gpr + fma.s1 POW_s = POW_s1, f1, POW_s2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_f12 = POW_f1, POW_f2,f0 + nop.i 999 +} +;; + + +{ .mfi + nop.f 999 +(p9) cmp.le.unc p0,p10 = 6, pow_GR_true_exp_Y_Gpr +} +;; + + + +{ .mfb + nop.m 999 + fma.s1 POW_e123 = POW_e12, f1, POW_e3 +(p8) br.cond.spnt L(POW_OVER_UNDER_X_NOT_INF) +} +;; + + +{ .mmf + fma.s1 POW_q = POW_Z3sq, POW_q, POW_Z3 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 POW_ssq = POW_s, POW_s, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_v4 = POW_s, POW_Q3, POW_Q2 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_v2 = POW_s, POW_Q1, POW_Q0_half + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_1ps = f1,f1,POW_s + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_f3 = POW_e123,f1,f1 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_T1T2 = POW_T1, POW_T2, f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_v3 = POW_ssq, POW_Q4, POW_v4 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_v21ps = POW_ssq, POW_v2, POW_1ps + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_s4 = POW_ssq, POW_ssq, f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_f123 = POW_f12, POW_f3, f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_A = POW_2M, POW_T1T2, f0 + nop.i 999 +} +;; + + + +{ .mfi + nop.m 999 +(p12) fmerge.s POW_f123 = f8,POW_f123 // if x neg, y odd int + nop.i 999 +} +{ .mfi + nop.m 999 +// fma.s1 POW_es = POW_ssq, POW_v3, POW_v2 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 POW_es = POW_s4, POW_v3, POW_v21ps + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 POW_A = POW_A, POW_f123, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +// fma.s1 POW_es = POW_es, POW_ssq, POW_1ps + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fma.s1 POW_A = POW_A, POW_es,f0 + nop.i 999 +} +;; + + + +{ .mfb + nop.m 999 +(p10) fma.s f8 = POW_A, POW_q, POW_A +(p10) br.ret.sptk b0 +} +;; + + + + + +// 
POSSIBLE_OVER_UNDER +// p6 = TRUE ==> Y negative + +{ .mfi + nop.m 999 + fmerge.s POW_abs_A = f0, POW_A + cmp.eq.unc p0,p6 = pow_GR_sign_Y, r0 +} +;; + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(POW_POSSIBLE_UNDER) +} +;; + +// POSSIBLE_OVER +// We got an answer. +// overflow is a possibility, not a certainty + + +// We define an overflow when the answer with +// WRE set +// user-defined rounding mode + +// double +// Largest double is 7FE (biased double) +// 7FE - 3FF + FFFF = 103FE +// Create + largest_double_plus_ulp +// Create - largest_double_plus_ulp +// Calculate answer with WRE set. + +// single +// Largest single is FE (biased double) +// FE - 7F + FFFF = 1007E +// Create + largest_single_plus_ulp +// Create - largest_single_plus_ulp +// Calculate answer with WRE set. + +// Cases when answer is ldn+1 are as follows: +// ldn ldn+1 +// --+----------|----------+------------ +// | +// +inf +inf -inf +// RN RN +// RZ + + +// Put in s2 (td set, wre set) +{ .mfi + mov pow_GR_gt_ln = 0x1007f + fsetc.s2 0x7F,0x42 + nop.i 999 +} +;; + + +{ .mfi + setf.exp POW_gt_pln = pow_GR_gt_ln + fma.s.s2 POW_wre_urm_f8 = POW_abs_A, POW_q, POW_abs_A + nop.i 999 ;; +} + +// Return s2 to default +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x40 + nop.i 999 +} +;; + + +// p7 = TRUE ==> yes, we have an overflow +{ .mfi + nop.m 999 + fcmp.ge.unc.s1 p7, p0 = POW_wre_urm_f8, POW_gt_pln + nop.i 999 +} +;; + + + +{ .mfb +(p7) mov pow_GR_tag = 30 + fma.s f8 = POW_A, POW_q, POW_A +(p7) br.cond.spnt __libm_error_region +} +{ .mfb + nop.m 999 + nop.f 999 +(p0) br.ret.sptk b0 +} +;; + + +L(POW_POSSIBLE_UNDER): +// We got an answer. input was < -2^9 but > -2^10 (double) +// We got an answer. 
input was < -2^6 but > -2^7 (float) +// underflow is a possibility, not a certainty + +// We define an underflow when the answer with +// ftz set +// is zero (tiny numbers become zero) +// Notice (from below) that if we have an unlimited exponent range, +// then there is an extra machine number E between the largest denormal and +// the smallest normal. +// So if with unbounded exponent we round to E or below, then we are +// tiny and underflow has occurred. +// But notice that you can be in a situation where we are tiny, namely +// rounded to E, but when the exponent is bounded we round to smallest +// normal. So the answer can be the smallest normal with underflow. +// E +// -----+--------------------+--------------------+----- +// | | | +// 1.1...10 2^-3fff 1.1...11 2^-3fff 1.0...00 2^-3ffe +// 0.1...11 2^-3ffe (biased, 1) +// largest dn smallest normal + + +// Put in s2 (td set, ftz set) +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x41 + nop.i 999 +} +;; + + + +{ .mfi + nop.m 999 + fma.s.s2 POW_ftz_urm_f8 = POW_A, POW_q, POW_A + nop.i 999 +} +;; + + +// Return s2 to default +{ .mfi + nop.m 999 + fsetc.s2 0x7F,0x40 + nop.i 999 +} +;; + + +// p7 = TRUE ==> yes, we have an underflow +{ .mfi + nop.m 999 + fcmp.eq.unc.s1 p7, p0 = POW_ftz_urm_f8, f0 + nop.i 999 +} +;; + + + + +{ .mfb +(p7) mov pow_GR_tag = 31 + fma.s f8 = POW_A, POW_q, POW_A +(p7) br.cond.spnt __libm_error_region +} +;; + + +{ .mfb + nop.m 999 + nop.f 999 + br.ret.sptk b0 +} +;; + + +L(POW_X_DENORM): +// Here if x unorm. 
Use the NORM_X for getf instructions, and then go back +// to normal path +{ .mfi + getf.exp pow_GR_signexp_X = POW_NORM_X + nop.f 999 + nop.i 999 +} +;; + +{ .mfi + getf.sig pow_GR_sig_X = POW_NORM_X + nop.f 999 + nop.i 999 +} +;; + +{ .mfi + and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones + nop.f 999 +} +;; + +{ .mib + sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones + shl pow_GR_offset = pow_GR_sig_X, 1 + br.cond.sptk L(POW_COMMON) +} +;; + + +L(POW_X_0_Y_0): +// When X is +-0 and Y is +-0, IEEE returns 1.0 +// We call error support with this value + +{ .mfb + mov pow_GR_tag = 32 + fma.s f8 = f1,f1,f0 + br.cond.sptk __libm_error_region +} +;; + + + + +L(POW_X_INF): +// When X is +-inf and Y is +-, IEEE returns + +// overflow +// X +inf Y +inf +inf +// X -inf Y +inf +inf + +// X +inf Y >0 +inf +// X -inf Y >0, !odd integer +inf <== (-inf)^0.5 = +inf !! +// X -inf Y >0, odd integer -inf + +// underflow +// X +inf Y -inf +0 +// X -inf Y -inf +0 + +// X +inf Y <0 +0 +// X -inf Y <0, !odd integer +0 +// X -inf Y <0, odd integer -0 + +// X + inf Y=+0 +1 +// X + inf Y=-0 +1 +// X - inf Y=+0 +1 +// X - inf Y=-0 +1 + +// p13 == Y negative +// p14 == Y positive + +// p6 == Y is a floating point number outside the integer range. +// Hence it is an integer and is even. +// p13 == (Y negative) +// return +0 +// p14 == (Y positive) +// return +inf + + + +// p7 == Y is a floating point number within the integer range. +// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even. 
+// p11 odd +// p13 == (Y negative) +// return (sign_of_x)0 +// p14 == (Y positive) +// return (sign_of_x)inf +// pxx even +// p13 == (Y negative) +// return +0 +// p14 == (Y positive) +// return +inf + +// pxx == Y is not an integer +// p13 == (Y negative) +// return +0 +// p14 == (Y positive) +// return +inf +// + +// If x=inf, test y and flag denormal +{ .mfi + nop.m 999 + fcmp.eq.s0 p10,p11 = f9,f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fcmp.lt p13,p14 = POW_NORM_Y,f0 + cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033 +} +{ .mfi + nop.m 999 + fclass.m p12,p0 = f9, 0x23 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 + fclass.m p15,p0 = f9, 0x07 //@zero + nop.i 999 +} +;; + +{ .mfb + nop.m 999 +(p15) fmerge.s f8 = f1,f1 +(p15) br.ret.spnt b0 +} +;; + + +{ .mfi +(p13) mov pow_GR_tag = 31 +(p14) frcpa.s1 f8,p10 = f1,f0 + nop.i 999 +} +{ .mfb +(p14) mov pow_GR_tag = 30 +(p13) fma.s1 f8 = f0,f0,f0 +(p12) br.ret.spnt b0 +} +;; + + + +{ .mfb + nop.m 999 +(p7) fcmp.eq.unc.s1 p9,p0 = POW_float_int_Y, POW_NORM_Y + nop.b 999 +} +;; + +{ .mfi + nop.m 999 + nop.f 999 +(p9) tbit.nz.unc p11,p0 = pow_GR_sig_int_Y,0 +} +;; + +{ .mfb + nop.m 999 +(p11) fmerge.s f8 = POW_NORM_X,f8 + br.ret.sptk b0 +} +;; + + + +L(POW_X_0_Y_NEG): +// When X is +-0 and Y is negative, IEEE returns +// X Y answer +// +0 -odd int +inf +// -0 -odd int -inf + +// +0 !-odd int +inf +// -0 !-odd int +inf + + +// p6 == Y is a floating point number outside the integer range. +// Hence it is an integer and is even. +// return +inf + +// p7 == Y is a floating point number within the integer range. +// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even. 
+// p11 odd +// return (sign_of_x)inf +// p12 even +// return +inf +// p10 == Y is not an integer +// return +inf +// +// + +{ .mfi + nop.m 999 + nop.f 999 + cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033 +} +;; + + +{ .mfi + mov pow_GR_tag = 33 +(p7) fcmp.eq.unc.s1 p9,p10 = POW_float_int_Y, POW_NORM_Y + nop.i 999 +} +;; + + +{ .mfb + nop.m 999 +(p6) frcpa.s0 f8,p13 = f1, f0 +(p6) br.cond.sptk __libm_error_region +} +;; + +{ .mfb + nop.m 999 +(p10) frcpa.s0 f8,p13 = f1, f0 +(p10) br.cond.sptk __libm_error_region +} +;; + + + +{ .mib + nop.m 999 +(p9) tbit.nz.unc p11,p12 = pow_GR_sig_int_Y,0 + nop.b 999 +} +;; + + + +{ .mfi + nop.m 999 +(p12) frcpa.s0 f8,p13 = f1,f0 + nop.i 999 +} +;; + +{ .mfb + nop.m 999 +(p11) frcpa f8,p13 = f1,f8 + br.cond.sptk __libm_error_region +} +;; + + + + +L(POW_X_NEG_Y_NONINT): +// When X is negative and Y is a non-integer, IEEE +// returns a qnan indefinite. +// We call error support with this value + +{ .mfb + mov pow_GR_tag = 34 + frcpa f8,p6 = f0,f0 + br.cond.sptk __libm_error_region +} +;; + + + + +L(POW_X_NAN_Y_0): +// When X is a NAN and Y is zero, IEEE returns 1. +// We call error support with this value. 
+ +{ .mfi + nop.m 0 + fma.s.s0 f10 = f8,f1,f0 + nop.i 0 +} +{ .mfb + mov pow_GR_tag = 35 + fma.s.s0 f8 = f0,f0,f1 + br.cond.sptk __libm_error_region +} +;; + + +L(POW_OVER_UNDER_X_NOT_INF): + +// p8 is TRUE for overflow +// p9 is TRUE for underflow + +// if y is infinity, we should not over/underflow + + +{ .mfi + nop.m 999 + fcmp.eq.unc.s1 p14, p13 = POW_xsq,f1 + cmp.eq.unc p8,p9 = pow_GR_sign_Y_Gpr, r0 +} +;; + +{ .mfi + nop.m 999 +(p14) fclass.m.unc p15, p0 = f9, 0x23 + nop.i 999 +} +{ .mfi + nop.m 999 +(p13) fclass.m.unc p11,p0 = f9, 0x23 + nop.i 999 +} +;; + +// p15 = TRUE if |x|=1, y=inf, return +1 +{ .mfb + nop.m 999 +(p15) fma.s f8 = f1,f1,f0 +(p15) br.ret.spnt b0 +} +;; + +.pred.rel "mutex",p8,p9 +{ .mfb +(p8) setf.exp f8 = pow_GR_17ones +(p9) fmerge.s f8 = f0,f0 +(p11) br.ret.sptk b0 +} + +{ .mfb + nop.m 999 + nop.f 999 + br.cond.sptk L(POW_OVER_UNDER_ERROR) +} +;; + +L(POW_Y_NAN): + +// Is x = +1 then result is +1, else result is quiet Y +{ .mfi + nop.m 999 + fcmp.eq.s1 p10,p9 = POW_NORM_X, f1 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p10) fcmp.eq.s0 p6,p0 = f9,f1 // Set invalid, even if x=+1 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p10) fma.s f8 = f1,f1,f0 + nop.i 999 +} +{ .mfb + nop.m 999 +(p9) fma.s f8 = f9,f8,f0 + br.ret.sptk b0 +} +;; + + +L(POW_OVER_UNDER_ERROR): + +{ .mfi + nop.m 999 + fmerge.s f10 = POW_NORM_X,POW_NORM_X + nop.i 999 +} +{ .mfi + sub pow_GR_17ones_m1 = pow_GR_17ones, r0, 1 + nop.f 999 + mov pow_GR_one = 0x1 +} +;; + +// overflow +{ .mmb +(p8) mov pow_GR_tag = 30 +(p8) setf.exp f11 = pow_GR_17ones_m1 + nop.b 999 +} +;; + + +// underflow +{ .mmi +(p9) mov pow_GR_tag = 31 +(p9) setf.exp f11 = pow_GR_one + nop.i 999 +} +;; + + +// p12 x is negative and y is an odd integer + + +{ .mfi + nop.m 999 + fma.s f8 = f11, f11, f0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p12) fmerge.ns f8 = f8, f8 + nop.i 999 +} +;; + + +.endp powf +ASM_SIZE_DIRECTIVE(powf) + + +// Stack operations when calling error support. 
+// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 +// | | | | +// | <-GR_Y Y2->| Y2 ->| <- GR_Y | +// | | | | +// | | <- GR_X X1 ->| | +// | | | | +// sp-64 -> + sp -> + sp -> + + +// save ar.pfs save b0 restore gp +// save gp restore ar.pfs + + + +.proc __libm_error_region +__libm_error_region: + +// Answer is inf for overflow and 0 for underflow. Common error exit: stores y (sp+32), x (sp+16) and the tentative result f8 (sp+48) as singles in a new 64-byte frame, calls __libm_error_support, then returns the value reloaded from sp+48. NOTE(review): pow_GR_tag is presumably aliased to the tag argument register; the alias is defined outside this view. +.prologue +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfs [GR_Parameter_Y] = POW_NORM_Y,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfs [GR_Parameter_X] = POW_NORM_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y // Parameter 2 address + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp // Recompute result address (sp+48) after the call +};; + +// (4) +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_powl.S b/sysdeps/ia64/fpu/e_powl.S new file mode 100644 index 0000000000..3b990444df --- /dev/null +++ b/sysdeps/ia64/fpu/e_powl.S @@ -0,0 +1,3437 @@ +.file "powl.s" + +// Copyright (c) 2000, 2001, Intel 
Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// ********************************************************************* +// +// Function: powl(x,y), where +// y +// powl(x,y) = x , for double extended precision x and y values +// +// ********************************************************************* +// +// History: +// 2/02/00 (Hand Optimized) +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 1/22/01 Corrected results for powl(1,inf), powl(1,nan), and +// powl(snan,0) to be 1 per C99, not nan. Fixed many flag settings. +// 2/06/01 Call __libm_error support if over/underflow when y=2. 
+// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: +// f8 (Input and Return Value) +// f9-f15,f32-f63,f99 +// +// General Purpose Registers: +// Locals r32 - r61 +// Parameters to __libm_error_support r62,r63,r64,r65 +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// Special Cases and IEEE special conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions raised when appropriate for pow +// Underflow exceptions raised when appropriate for pow +// (Error Handling Routine called for overflow and Underflow) +// Inexact raised when appropriate by algorithm +// +// 1. (anything) ** NatVal or (NatVal) ** anything is NatVal +// 2. X or Y unsupported or sNaN is qNaN/Invalid +// 3. (anything) ** 0 is 1 +// 4. (anything) ** 1 is itself +// 5. (anything except 1) ** qNAN is qNAN +// 6. qNAN ** (anything except 0) is qNAN +// 7. +-(|x| > 1) ** +INF is +INF +// 8. +-(|x| > 1) ** -INF is +0 +// 9. +-(|x| < 1) ** +INF is +0 +// 10. +-(|x| < 1) ** -INF is +INF +// 11. +-1 ** +-INF is +1 +// 12. +0 ** (+anything except 0, NAN) is +0 +// 13. -0 ** (+anything except 0, NAN, odd integer) is +0 +// 14. +0 ** (-anything except 0, NAN) is +INF/div_0 +// 15. -0 ** (-anything except 0, NAN, odd integer) is +INF/div_0 +// 16. -0 ** (odd integer) = -( +0 ** (odd integer) ) +// 17. +INF ** (+anything except 0,NAN) is +INF +// 18. +INF ** (-anything except 0,NAN) is +0 +// 19. -INF ** (anything except NAN) = -0 ** (-anything) +// 20. (-anything) ** (integer) is (-1)**(integer)*(+anything**integer) +// 21. (-anything except 0 and inf) ** (non-integer) is qNAN/Invalid +// 22. X or Y denorm/unorm and denorm/unorm operand trap is enabled, +// generate denorm/unorm fault except if invalid or div_0 raised. 
+// +// ********************************************************************* +// +// Algorithm +// ========= +// +// Special Cases +// +// If Y = 2, return X*X. +// If Y = 0.5, return sqrt(X). +// +// Compute log(X) to extra precision. +// +// ker_log_80( X, logX_hi, logX_lo, Safe ); +// +// ...logX_hi + logX_lo approximates log(X) to roughly 80 +// ...significant bits of accuracy. +// +// Compute Y*log(X) to extra precision. +// +// P_hi := Y * logX_hi +// P_lo := Y * logX_hi - P_hi ...using FMA +// P_lo := Y * logX_lo + P_lo ...using FMA +// +// Compute exp(P_hi + P_lo) +// +// Flag := 2; +// Expo_Range := 2; (assuming double-extended power function) +// ker_exp_64( P_hi, P_lo, Flag, Expo_Range, +// Z_hi, Z_lo, scale, Safe ) +// +// scale := sgn * scale +// +// If (Safe) then ...result will not over/underflow +// return scale*Z_hi + (scale*Z_lo) +// quickly +// Else +// take necessary precaution in computing +// scale*Z_hi + (scale*Z_lo) +// to set possible exceptions correctly. +// End If +// +// Case_Y_Special +// +// ...Follow the order of the case checks +// +// If Y is +-0, return +1 without raising any exception. +// If Y is +1, return X without raising any exception. +// If Y is qNaN, return Y without exception. +// If X is qNaN, return X without exception. +// +// At this point, X is real and Y is +-inf. +// Thus |X| can only be 1, strictly bigger than 1, or +// strictly less than 1. +// +// If |X| < 1, then +// return ( Y == +inf? +0 : +inf ) +// elseif |X| > 1, then +// return ( Y == +inf? +inf : +0 ) +// else +// goto Case_Invalid +// +// Case_X_Special +// +// ...Follow the order of the case checks +// ...Note that Y is real, finite, non-zero, and not +1. +// +// If X is qNaN, return X without exception. +// +// If X is +-0, +// return ( Y > 0 ? +0 : +inf ) +// +// If X is +inf +// return ( Y > 0 ? +inf : +0 ) +// +// If X is -inf +// return -0 ** -Y +// return ( Y > 0 ? 
+inf : +0 ) +// +// Case_Invalid +// +// Return 0 * inf to generate a quiet NaN together +// with an invalid exception. +// +// Implementation +// ============== +// +// We describe the quick branch since this part is important +// in reaching the normal case efficiently. +// +// STAGE 1 +// ------- +// This stage contains two threads. +// +// Stage1.Thread1 +// +// fclass.m X_excep, X_ok = X, (NatVal or s/qNaN) or +// +-0, +-infinity +// +// fclass.nm X_unsupp, X_supp = X, (NatVal or s/qNaN) or +// +-(0, unnorm, norm, infinity) +// +// X_norm := fnorm( X ) with traps disabled +// +// If (X_excep) goto Filtering (Step 2) +// If (X_unsupp) goto Filtering (Step 2) +// +// Stage1.Thread2 +// .............. +// +// fclass.m Y_excep, Y_ok = Y, (NatVal or s/qNaN) or +// +-0, +-infinity +// +// fclass.nm Y_unsupp, Y_supp = Y, (NatVal or s/qNaN) or +// +-(0, unnorm, norm, infinity) +// +// Y_norm := fnorm( Y ) with traps disabled +// +// If (Y_excep) goto Filtering (Step 2) +// If (Y_unsupp) goto Filtering (Step 2) +// +// +// STAGE 2 +// ------- +// This stage contains two threads. +// +// Stage2.Thread1 +// .............. +// +// Set X_lt_0 if X < 0 (using fcmp) +// sgn := +1.0 +// If (X_lt_0) goto Filtering (Step 2) +// +// Stage2.Thread2 +// .............. +// +// Set Y_is_1 if Y = +1 (using fcmp) +// If (Y_is_1) goto Filtering (Step 2) +// +// STAGE 3 +// ------- +// This stage contains two threads. +// +// +// Stage3.Thread1 +// .............. +// +// X := fnorm(X) in prevailing traps +// +// +// Stage3.Thread2 +// .............. +// +// Y := fnorm(Y) in prevailing traps +// +// STAGE 4 +// ------- +// +// Go to Case_Normal. 
+// + +#include "libm_support.h" + +#ifdef _LIBC +.rodata +#else +.data +#endif + +// Inv_L, L_hi, L_lo +.align 64 +Constants_exp_64_Arg: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) +data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 +data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 +data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) + +.align 64 +Constants_exp_64_Exponents: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) +data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF +data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF +data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF +ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) + +.align 64 +Constants_exp_64_A: +ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) +// Reversed +data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 +data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 +data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_exp_64_A) + +.align 64 +Constants_exp_64_P: +ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) +// Reversed +data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 +data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 +data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 +data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_exp_64_P) + +.align 64 +Constants_exp_64_T1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) +data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 +data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 +data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC +data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D +data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA +data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516 +data4 
0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A +data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4 +data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B +data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD +data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15 +data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B +data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 +data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A +data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 +data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C +ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) + +.align 64 +Constants_exp_64_T2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) +data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 +data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 +data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E +data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 +data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 +data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA +data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 +data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A +data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 +data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA +data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 +data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA +data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 +data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 +data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE +data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 +ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) + +.align 64 +Constants_exp_64_W1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) +data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454 +data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 +data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA +data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 +data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 +data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE +data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B +data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 +data4 
0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 +data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 +data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A +data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB +data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E +data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA +data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 +data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B +data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 +data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 +data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 +data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 +data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB +data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 +data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C +data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D +data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 +data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F +data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 +data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 +data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC +data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB +data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB +data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) + +.align 64 +Constants_exp_64_W2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) +data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 +data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 +data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A +data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E +data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 +data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 +data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 +data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 +data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 +data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D +data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 +data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 +data4 
0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 +data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F +data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 +data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 +data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D +data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 +data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 +data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED +data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B +data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 +data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 +data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C +data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 +data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE +data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 +data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 +data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 +data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 +data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE +data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) + +.align 64 +Constants_log_80_P: +ASM_TYPE_DIRECTIVE(Constants_log_80_P,@object) +// 1/2, P_8, P_7, ..., P_1 +data4 0x00000000, 0x80000000, 0x00003FFE, 0x00000000 +data4 0x3B1042BC, 0xCCCE8B88, 0x0000BFFB, 0x00000000 +data4 0xCADC2149, 0xE38997B7, 0x00003FFB, 0x00000000 +data4 0xB1ACB090, 0xFFFFFFFE, 0x0000BFFB, 0x00000000 +data4 0x06481C81, 0x92492498, 0x00003FFC, 0x00000000 +data4 0xAAAAB0EF, 0xAAAAAAAA, 0x0000BFFC, 0x00000000 +data4 0xCCC91416, 0xCCCCCCCC, 0x00003FFC, 0x00000000 +data4 0x00000000, 0x80000000, 0x0000BFFD, 0x00000000 +data4 0xAAAAAAAB, 0xAAAAAAAA, 0x00003FFD +ASM_SIZE_DIRECTIVE(Constants_log_80_P) + +.align 64 +Constants_log_80_Q: +ASM_TYPE_DIRECTIVE(Constants_log_80_Q,@object) +// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1 +data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 +data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 +data4 0xA51BE0AF,0x92492453,0x00003FFC,0x00000000 +data4 
0xA0CFD29F,0xAAAAAB73,0x0000BFFC,0x00000000 +data4 0xCCCE3872,0xCCCCCCCC,0x00003FFC,0x00000000 +data4 0xFFFFB4FB,0xFFFFFFFF,0x0000BFFC,0x00000000 +data4 0xAAAAAAAB,0xAAAAAAAA,0x00003FFD,0x00000000 +data4 0x00000000,0x80000000,0x0000BFFE,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_log_80_Q) + +.align 64 +Constants_log_80_Z_G_H_h1: +ASM_TYPE_DIRECTIVE(Constants_log_80_Z_G_H_h1,@object) +// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double +data4 0x00008000,0x3F800000,0x00000000,0x00000000 +data4 0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000 +data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000 +data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000 +data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000 +data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000 +data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000 +data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000 +data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000 +data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000 +data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000 +data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000 +data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000 +data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000 +data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000 +data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000 +data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000 +data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000 +data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000 +data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000 +data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000 +data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000 +data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000 +data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000 +data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000 +data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000 +data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000 +data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000 +data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000 +data4 
0x00004211,0x3F042108,0x3F29516A,0x00000000 +data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_log_80_Z_G_H_h1) + +.align 64 +Constants_log_80_Z_G_H_h2: +ASM_TYPE_DIRECTIVE(Constants_log_80_Z_G_H_h2,@object) +// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double +data4 0x00008000,0x3F800000,0x00000000,0x00000000 +data4 0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000 +data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000 +data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000 +data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000 +data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000 +data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000 +data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000 +data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000 +data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000 +data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000 +data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000 +data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000 +data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000 +data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000 +data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000 +data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000 +data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000 +data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000 +data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000 +data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000 +data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000 +data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000 +data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000 +data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000 +data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000 +data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000 +data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000 +data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000 +data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000 +data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_log_80_Z_G_H_h2) + +.align 64 
+Constants_log_80_h3_G_H: +ASM_TYPE_DIRECTIVE(Constants_log_80_h3_G_H,@object) +// h3 IEEE double extended, H3 and G3 IEEE single +data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00 +data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400 +data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00 +data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400 +data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00 +data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400 +data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08 +data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408 +data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10 +data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410 +data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18 +data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420 +data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20 +data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428 +data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30 +data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438 +data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40 +data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448 +data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50 +data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458 +data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68 +data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470 +data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78 +data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488 +data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90 +data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0 +data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8 +data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8 +data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8 +data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8 +data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0 +data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0 +data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here +data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D +data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101 +data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED +data4 
0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766 +data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6 +data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620 +data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D +ASM_SIZE_DIRECTIVE(Constants_log_80_h3_G_H) + +.align 64 +Constant_half: +ASM_TYPE_DIRECTIVE(Constant_half,@object) +data4 0x00000000,0x80000000,0x00003FFE +ASM_SIZE_DIRECTIVE(Constant_half) + +GR_Expo_Range = r32 +GR_Flag = r33 +GR_Table_Ptr = r34 + +GR_Table_Ptr1 = r35 +GR_BIAS = r35 + +GR_Index1 = r36 +GR_sign_mask = r36 + +GR_Index2 = r37 +GR_Expo_X = r37 + +GR_signif_Z = r38 +GR_M = r38 + +GR_X_0 = r39 +GR_Mask = r39 + +GR_X_1 = r40 +GR_W1_ptr = r40 + +GR_W2_ptr = r41 +GR_X_2 = r41 + +GR_Z_1 = r42 +GR_M2 = r42 + +GR_M1 = r43 +GR_Z_2 = r43 + +GR_N = r44 +GR_k = r44 + +GR_Big_Pos_Exp = r45 + + +GR_BIAS_p_k = r47 +GR_BIASed_exp_y = r47 + +GR_Big_Neg_Exp = r48 +GR_Index3 = r48 +GR_temp = r48 + +GR_vsm_expo = r49 +GR_y_sign = r49 + +GR_T1_ptr = r50 +GR_T2_ptr = r51 +GR_N_fix = r52 +GR_exp_y = r53 +GR_signif_y = r54 +GR_exp_and_sign_y = r55 +GR_low_order_bit = r56 +GR_get_exp_mask = r57 +GR_exponent_zero = r58 + +// ** Registers for unwind support + +GR_SAVE_PFS = r59 +GR_SAVE_B0 = r60 +GR_SAVE_GP = r61 +GR_Parameter_X = r62 +GR_Parameter_Y = r63 +GR_Parameter_RESULT = r64 +GR_Parameter_TAG = r65 + +FR_X = f8 +FR_Y = f9 +FR_RESULT = f99 + +// ** + +FR_Input_X = f8 +FR_Output = f8 +FR_Input_Y = f9 + +FR_Neg = f10 +FR_P_hi = f10 +FR_X = f10 + +FR_Half = f11 +FR_h_3 = f11 +FR_poly_hi = f11 + +FR_Sgn = f12 + +FR_Neg_X = f13 +FR_half_W = f13 + +FR_X_cor = f14 +FR_P_lo = f14 + +FR_W = f15 + +FR_X_lo = f32 + +FR_S = f33 +FR_W3 = f33 + +FR_Y_hi = f34 +FR_logx_hi = f34 + +FR_Z = f35 +FR_logx_lo = f35 +FR_GS_hi = f35 +FR_Y_lo = f35 + +FR_r_cor = f36 +FR_Scale = f36 + +FR_G_1 = f37 +FR_G = f37 +FR_Wsq = f37 +FR_L_Inv = f37 +FR_temp = f37 + +FR_H_1 = f38 +FR_H = f38 +FR_W4 = f38 +FR_float_N = f38 + +FR_h = f39 +FR_h_1 = f39 +FR_N = f39 +FR_P_7 = f39 + +FR_G_2 = f40 +FR_P_8 = f40 
+FR_L_hi = f40 + +FR_H_2 = f41 +FR_L_lo = f41 +FR_A_1 = f41 + +FR_h_2 = f42 +FR_P_6 = f42 + +FR_abs_W = f43 +FR_W1 = f43 + +FR_G_3 = f44 +FR_P_8 = f44 +FR_T1 = f44 + +FR_log2_hi = f45 +FR_W2 = f45 + +FR_GS_lo = f46 +FR_T2 = f46 + +FR_W_1_p1 = f47 +FR_H_3 = f47 + +FR_float_N = f48 + +FR_P_4 = f49 +FR_A_2 = f49 + +FR_Q_4 = f50 +FR_r4 = f50 + +FR_Q_3 = f51 +FR_A_3 = f51 + +FR_Q_2 = f52 +FR_P_2 = f52 + +FR_Q_1 = f53 +FR_P_1 = f53 +FR_T = f53 + +FR_Wp1 = f54 +FR_Q_5 = f54 +FR_P_3 = f54 + +FR_Q_6 = f55 + +FR_log2_lo = f56 +FR_Two = f56 + +FR_Big = f57 + +FR_neg_2_mK = f58 +FR_NBig = f58 + +FR_r = f59 + +FR_poly_lo = f60 + +FR_poly = f61 + +FR_P_5 = f62 + +FR_rsq = f63 + +FR_Result = f99 +FR_Result_small = f100 +FR_Result_big = f101 + +.section .text +.proc powl# +.global powl# +.align 64 + +powl: +{ .mfi +alloc GR_Expo_Range = ar.pfs,0,30,4,0 +(p0) fclass.m.unc p7, p13 = FR_Input_Y, 0x1E7 +nop.i 0 +} +{ .mfi +(p0) getf.exp GR_exp_and_sign_y = FR_Input_Y +// +// Save State +// +(p0) fclass.m.unc p6, p12 = FR_Input_X, 0x1E7 +nop.i 0 +};; +{ .mfi +(p0) getf.sig GR_signif_y = FR_Input_Y +(p0) fcmp.eq.unc.s1 p12, p13 = FR_Input_X, f1 +nop.i 0 +} +{ .mfi + nop.m 999 +// +// Check for y = 1 +// Identify EM unsupporteds. 
+// Load FR_half = .5 +// +(p0) fadd.s1 FR_Two = f1, f1 +// +// Load 1/2 in GP register +// +nop.i 0 +} +;; + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Ptr = @ltoff(Constant_half#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] + nop.m 999 + nop.i 999 +} +;; + +{ .mlx +(p0) ldfe FR_Half =[GR_Table_Ptr],0 +(p0) movl GR_get_exp_mask = 0x1FFFF ;; +} + +{ .mfi + nop.m 999 +(p0) fclass.nm.unc p9, p15 = FR_Input_Y, 0x1FF +// +// Create FR_Two = 2 +// Get exp and significand of Y +// Crate Masks +// sgn = 1 +// +(p0) and GR_exp_y = GR_get_exp_mask,GR_exp_and_sign_y +} +{ .mlx + nop.m 999 +(p0) movl GR_exponent_zero = 0xFFFF ;; +} +{ .mfi + nop.m 999 +(p0) mov FR_Sgn = f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s1 p10, p11 = FR_Input_Y, f1 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Identify NatVals, NaNs, Infs, and Zeros. +// Load Half +// +(p0) fclass.nm.unc p8, p14 = FR_Input_X, 0x1FF +// +// Remove sign bit from exponent of y. +// Check for x = 1 +// +(p6) br.cond.spnt L(POWL_64_SPECIAL) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.spnt L(POWL_64_SPECIAL) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p8) br.cond.spnt L(POWL_64_UNSUPPORT) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt L(POWL_64_UNSUPPORT) ;; +} +{ .mfi +(p0) cmp.lt.unc p9, p0 = GR_exp_y,GR_exponent_zero +(p0) fcmp.lt.unc.s1 p6, p13 = FR_Input_X, f0 +// +// Branch on Infs, Nans, Zeros, and Natvals +// Check to see that exponent < 0 +// +(p0) sub GR_exp_y = GR_exp_y,GR_exponent_zero +} +// x not zero, is y ==2? 
+{ .mfi + nop.m 999 +(p11) fcmp.eq.unc.s1 p7, p14 = FR_Input_Y, FR_Two + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p9) fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0 +(p7) br.cond.spnt L(POWL_64_SQUARE) ;; // Branch if x not zero and y=2 +} +{ .mfi + nop.m 999 +(p6) fmerge.ns FR_Neg_X = FR_Input_X, FR_Input_X + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fmpy.s0 FR_Result = FR_Input_X, f1 +// +// For y = 1, compute result = x +// For x = 1, compute 1 +// When Y is one return X and possible raise +// denormal operand exception. +// Remove exponent BIAS +// +(p6) shl GR_exp_and_sign_y= GR_signif_y,GR_exp_y ;; +} +{ .mfi +(p9) or GR_exp_and_sign_y = 0xF,GR_signif_y +(p12) fma.s0 FR_Result = FR_Input_Y, f0, f1 + nop.i 999 ;; +} +{ .mii + nop.m 999 +(p6) extr.u GR_exp_y = GR_exp_and_sign_y,63,1 ;; +(p6) cmp.ne.unc p9, p0 = GR_exp_y, r0 +} +{ .mii + nop.m 999 +// +// Both predicates can be set. +// Don't consider y's < 1. +// +(p6) shl GR_signif_y= GR_exp_and_sign_y,1 ;; +// +// Is shift off integer part of y. +// Get y's even or odd bit. +// +(p6) cmp.ne.unc p8, p0 = GR_signif_y, r0 +} +{ .mib + nop.m 999 + nop.i 999 +// +// Is the fractional part of the y = 0? +// Is the integer even or odd. +// +(p10) br.cond.spnt L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p12) br.cond.spnt L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p8) br.cond.spnt L(POWL_64_XNEG) ;; +} +{ .mfi + nop.m 999 +(p9) fmerge.ns FR_Sgn = FR_Sgn, FR_Sgn + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s0 p11, p0 = FR_Input_Y, FR_Half + nop.i 999 ;; +} +// +// Raise possible denormal operand exception for both +// X and Y. +// +{ .mfb + nop.m 999 +// +// Branch for (x < 0) and Y not an integer. +// +(p0) fcmp.eq.unc.s0 p12, p0 = FR_Input_X, f1 +// +// For x < 0 and y integer, make x positive +// For x < 0 and y odd integer,, set sign = -1. 
+// +(p11) br.cond.spnt L(POWL_64_SQRT) ;; +} +{ .mmf +(p0) cmp.eq.unc p15, p14 = r0, r0 + nop.m 999 +(p13) fnorm.s1 FR_Z = FR_Input_X ;; +} +{ .mfi + nop.m 999 +(p6) fnorm.s1 FR_Z = FR_Neg_X + nop.i 999 +} +;; + +// +// Branch to embedded sqrt(x) +// +// +// Computes ln( x ) to extra precision +// Input FR 1: FR_X +// Output FR 2: FR_Y_hi +// Output FR 3: FR_Y_lo +// Output PR 1: PR_Safe +// + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Z_G_H_h1#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] + nop.m 999 + nop.i 999 +} +;; + + +{ .mlx + nop.m 999 +(p0) movl GR_BIAS = 0x000000000000FFFF ;; +} +{ .mfi + nop.m 999 +(p0) fsub.s1 FR_W = FR_Z, f1 + nop.i 999 ;; +} +// +// Z = Norm(X) - both + and - case +// Set Safe = True +// +{ .mmb +(p0) getf.sig GR_signif_Z = FR_Z +(p0) getf.exp GR_N = FR_Z + nop.b 999 ;; +} +{ .mii + nop.m 999 +// +// Get significand of Z +// W = Z - 1 +// +(p0) extr.u GR_Index1 = GR_signif_Z, 59, 4 ;; +// +// Index1 = High order 4 bits of Z +// X_0 = High order 15 bit of Z +// +(p0) shl GR_Index1 = GR_Index1,5 ;; +} +{ .mfi + nop.m 999 +// +// Add offset to Index1 ptr. +// +(p0) fabs FR_abs_W = FR_W +// +// BIAS = 0x000...FFFF +// Adjust Index1 ptr ( x 32) . +// +(p0) add GR_Index1 = GR_Index1,GR_Table_Ptr +} +{ .mmi + nop.m 999 ;; +(p0) ld2 GR_Z_1 =[GR_Index1],4 +(p0) extr.u GR_X_0 = GR_signif_Z, 49, 15 +} +;; + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Z_G_H_h2#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] + nop.m 999 + nop.i 999 +} +;; + + +{ .mmi +(p0) ldfs FR_G_1 = [GR_Index1],4 ;; +(p0) ldfs FR_H_1 = [GR_Index1],8 + nop.i 999 ;; +} +// +// Adjust Index2 (x 32). 
+// +{ .mfi +(p0) ldfe FR_h_1 = [GR_Index1],0 + nop.f 999 +(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 ;; +} +{ .mmi + nop.m 999 ;; +// +// load Z_1 from Index1 +// abs_W = |W| +// Point to Table2 +// +(p0) getf.exp GR_M = FR_abs_W +// +// M = M - BIAS +// Load G_1 +// N = exponent of Z +// + nop.i 999;; +} +{ .mmi + nop.m 999 + nop.m 999 + nop.i 999;; +} +{ .mmi + nop.m 999 + nop.m 999 + nop.i 999;; +} +{ .mmi + nop.m 999 + nop.m 999 +(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;; +} +{ .mii + nop.m 999 +// +// Extract Index2 +// Load H_1 +// Is -8 > M ? +// +(p0) shl GR_Index2=GR_Index2,5 ;; +(p0) add GR_Index2 = GR_Index2, GR_Table_Ptr +} +// +// M = exponent of abs_W +// X_1 = X_0 * Z_1 +// +{ .mii +(p0) sub GR_M = GR_M, GR_BIAS + nop.i 999 ;; +(p0) cmp.gt.unc p7, p14 = -8, GR_M +} +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.spnt L(LOGL80_NEAR) ;; +} +// +// Load h_1 +// Possible branch out. +// Add offset of table to Index2 +// +{ .mfi +(p0) ld2 GR_Z_2 =[GR_Index2],4 +(p0) fmerge.se FR_S = f1,FR_Z +(p0) sub GR_N = GR_N, GR_BIAS +} +;; + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_h3_G_H#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] + nop.m 999 + nop.i 999 +} +;; + +// +// load Z_2 +// N - BIAS +// Point to Table 3. +// S = merging of Z and 1.0 +// +{ .mmi +(p0) ldfs FR_G_2 = [GR_Index2],4 +(p0) setf.sig FR_float_N = GR_N +(p0) add GR_Table_Ptr1 = 0x200,GR_Table_Ptr ;; +} +// +// load G_2 +// X_2 = X_1 * Z_2 +// Add offset to Table 2 ptr. 
+// float_N = significand of N +// +{ .mmi +(p0) ldfs FR_H_2 = [GR_Index2],8 ;; +// +// load H_2 +// G = G * G_2 +// +(p0) ldfe FR_h_2 = [GR_Index2],0 +(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;; +} +{ .mmi + nop.m 999 + nop.m 999 + nop.i 999;; +} +{ .mmi + nop.m 999 + nop.m 999 + nop.i 999;; +} +{ .mmi + nop.m 999 + nop.m 999 + nop.i 999;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;; +} +{ .mfi +(p0) shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 + nop.f 999 +// +// h = h_1 + h_2 +// Adjust Index3 +// +(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr ;; +} +{ .mmb + nop.m 999 +(p0) ldfe FR_h_3 = [GR_Index3],12 + nop.b 999 ;; +} +{ .mmf +(p0) ldfs FR_H_3 = [GR_Table_Ptr1],0 +// +// float_N = Make N a fp number +// Load h_3 +// Get pointer to Q table. +// +(p0) ldfs FR_G_3 = [GR_Index3],0 +(p0) fmpy.s1 FR_G = FR_G_1, FR_G_2 +} +;; + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Q#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] + nop.m 999 + nop.i 999 +} +;; + + + +{ .mfi +(p0) ldfe FR_log2_hi = [GR_Table_Ptr],16 +(p0) fadd.s1 FR_H = FR_H_1, FR_H_2 + nop.i 999 ;; +} +{ .mmf + nop.m 999 +// +// G = G_1 * G_2 * G_3 +// +(p0) ldfe FR_log2_lo = [GR_Table_Ptr],16 +// +// load h_2 +// H = H_1 + H_2 +// Get Index3 +// +(p0) fadd.s1 FR_h = FR_h_1, FR_h_2 ;; +} +// +// Load log2_lo part +// r = G*S -1 +// +{ .mfi +(p0) ldfe FR_Q_6 = [GR_Table_Ptr],16 +// +// Load H_3 +// +(p0) fcvt.xf FR_float_N = FR_float_N + nop.i 999 ;; +} +// +// Load Q_6 +// +{ .mmi +(p0) ldfe FR_Q_5 = [GR_Table_Ptr],16 ;; +(p0) ldfe FR_Q_4 = [GR_Table_Ptr],16 + nop.i 999 ;; +} +{ .mmi +(p0) ldfe FR_Q_3 = [GR_Table_Ptr],16 ;; +(p0) ldfe FR_Q_2 = [GR_Table_Ptr],16 + nop.i 999 ;; +} +{ .mmf + nop.m 999 +// +// poly_lo = Q_5 + r * Q_6 +// Load Q_2 +// rsq = r * r +// +(p0) ldfe FR_Q_1 = [GR_Table_Ptr],16 +// +// h = h_1 + h_2 + h_3 +// H = H_1 + H_2 + H_3 +// Load G_3. 
+// Begin Loading Q's - load log2_hi part +// +(p0) fmpy.s1 FR_G = FR_G, FR_G_3 +} +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_H = FR_H, FR_H_3 + nop.i 999 +} +;; + +// +// Y_lo = poly + Y_lo +// + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Arg#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] + nop.m 999 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_h = FR_h, FR_h_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Load Q_5 +// +(p0) fmpy.s1 FR_GS_hi = FR_G, FR_S + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fms.s1 FR_r = FR_G, FR_S, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// GS_hi = G*S +// Load Q_4 +// +(p0) fsub.s1 FR_r_cor = FR_GS_hi, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Load Q_3 +// r_cor = GS_hi -1 +// GS_lo = G*S - GS_hi +// +(p0) fmpy.s1 FR_rsq = FR_r, FR_r + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly = poly_hi + rsq * poly_lo +// Tbl = float_N*log2_hi + H +// +(p0) fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// r_cor = r_cor - r +// poly_hi = r * Q_2 + Q_1 +// +(p0) fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Load Q_1 +// +(p0) fsub.s1 FR_r_cor = FR_r_cor, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Y_lo = float_N*log2_lo + h +// +(p0) fadd.s1 FR_Y_hi = FR_G, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly_lo = Q_4 + r * poly_lo;; +// r_cor = r_cor + GS_lo;; +// +(p0) fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_r_cor = FR_r_cor, 
FR_Y_lo + nop.i 999 +} +{ .mfi + nop.m 999 +// +// poly_lo = Q_3 + r * poly_lo;; +// +(p0) fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fsub.s1 FR_Y_lo = FR_G, FR_Y_hi + nop.i 999 +} +{ .mmi +(p0) ldfe FR_L_Inv = [GR_Table_Ptr],16 ;; +(p0) ldfe FR_L_hi = [GR_Table_Ptr],16 + nop.i 999 ;; +} +{ .mfi +(p0) ldfe FR_L_lo = [GR_Table_Ptr],16 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Y_hi = Tbl + r +// r_cor = r_cor + Y_lo +// +(p0) fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// Y_lo = Tbl - Y_hi +// poly = rsq * poly + r_cor +// +(p0) fadd.s1 FR_Y_lo = FR_Y_lo, FR_r + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Y_lo = Y_lo + r +// +(p0) fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly +// +// Load L_Inv +// Load L_hi +// Load L_lo +// all long before they are needed. +// They are used in LOGL_RETURN PATH +// +br.cond.sptk L(LOGL_RETURN) ;; +} +L(LOGL80_NEAR): +// +// Branch LOGL80_NEAR +// + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_P#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] + nop.m 999 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p0) fmpy.s1 FR_Wsq = FR_W, FR_W +(p0) add GR_Table_Ptr1 = 0x50,GR_Table_Ptr +} +// +// Adjust ptr to 1/2 +// Adjust Ptr1 to P_4 +// +{ .mmi +(p0) ldfe FR_Half = [GR_Table_Ptr],16 ;; +(p0) ldfe FR_P_4 = [GR_Table_Ptr1],16 + nop.i 999 +} +// +// Load 1/2 +// +{ .mmi +(p0) ldfe FR_P_8 = [GR_Table_Ptr],16 ;; +(p0) ldfe FR_P_3 = [GR_Table_Ptr1],16 + nop.i 999 +} +{ .mmi +(p0) ldfe FR_P_7 = [GR_Table_Ptr],16 ;; +(p0) ldfe FR_P_2 = [GR_Table_Ptr1],16 + nop.i 999 +} +// +// Load P_7 +// half_W = .5 * W +// Load P_3 +// +{ .mmi +(p0) ldfe FR_P_6 = [GR_Table_Ptr],16 ;; +(p0) ldfe FR_P_1 = [GR_Table_Ptr1],16 + nop.i 999 ;; +} +// +// Load P_6 +// Wsq = w * w +// poly = w*P_4 + P_3 +// Load P_2 +// +{ .mfi +(p0) ldfe FR_P_5 = [GR_Table_Ptr],16 +// +// Load P_5 +// poly_lo = w * P_8 + P_7 +// Y_hi = w - (1/2)w*w +// 
Load P_1 +// +(p0) fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 FR_W3 = FR_Wsq, FR_W + nop.i 999 +} +;; + +// +// Y_lo = W3 * poly + Y_lo +// + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Arg#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] + nop.m 999 + nop.i 999 +} +;; + + +{ .mmi +(p0) ldfe FR_L_Inv = [GR_Table_Ptr],16 ;; +(p0) ldfe FR_L_hi = [GR_Table_Ptr],16 + nop.i 999 ;; +} +{ .mfi +(p0) ldfe FR_L_lo = [GR_Table_Ptr],16 +// +// Load P_8 +// Load P_4 +// +(p0) fmpy.s1 FR_half_W = FR_Half, FR_W + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// W4 = Wsq * Wsq +// poly = w *poly + P_2 +// +(p0) fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly = FR_W, FR_poly, FR_P_2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fsub.s1 FR_Y_lo = FR_W, FR_Y_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly = w * poly + P_1 +// w3 = wsq * w +// +(p0) fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// poly_lo = w * poly_lo + P_6 +// Y_lo = W - Y_hi +// +(p0) fma.s1 FR_poly = FR_W, FR_poly, FR_P_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly_lo = w * poly_lo + +// Y_lo = Y_lo - w * (1/2)w +// +(p0) fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Y_lo = (W-Y_hi) - w * (1/2)w +// poly = W4* poly_lo + poly +// +(p0) fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo + nop.i 999 ;; +} +L(LOGL_RETURN): +{ .mfi +(p0) add GR_Expo_Range = 0x2,r0 +// +// Load L_Inv +// Load L_hi +// Load L_lo +// all long before they are needed. 
+// +// +// kernel_log_80 computed ln(X) +// and return logX_hi and logX_lo as results. +// PR_pow_Safe set as well. +// +(p0) fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo +// +// Compute Y * (logX_hi + logX_lo) +// P_hi -> X +// P_lo -> X_cor +// (Manipulate names so that inputs are in +// the place kernel_exp expects them) +// Set GR_Flag to 2 +// Set GR_Expo_Range to Double +// +// This function computes exp( x + x_cor) +// Input FR 1: FR_X +// Input FR 2: FR_X_cor +// Input GR 1: GR_Flag +// Input GR 2: GR_Expo_Range +// Output FR 3: FR_Y_hi +// Output FR 4: FR_Y_lo +// Output FR 5: FR_Scale +// Output PR 1: PR_Safe +// +(p0) cmp.eq.unc p15, p0 = r0, r0 +} +;; + +{ .mmi +(p0) addl GR_W1_ptr = @ltoff(Constants_exp_64_W1#), gp +(p0) addl GR_W2_ptr = @ltoff(Constants_exp_64_W2#), gp +(p0) add GR_Flag = 0x2,r0 +} +;; + +{ .mmi + ld8 GR_W1_ptr = [GR_W1_ptr] + ld8 GR_W2_ptr = [GR_W2_ptr] +(p0) cmp.ne.unc p7, p0 = 0x1, GR_Flag +} +;; + +{ .mlx + nop.m 999 +(p0) movl GR_Mask = 0x1FFFF ;; +} + + +{ .mlx + nop.m 999 +(p0) movl GR_BIAS = 0x0FFFF ;; +} +{ .mfi + nop.m 999 +// +// X_lo = Y * logX_lo +// +(p0) fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Set Safe=True +// Flag is always 2 for this routine +// +(p0) fmpy.s1 FR_float_N = FR_X, FR_L_Inv + nop.i 999 +} +{ .mfi + nop.m 999 +// +// X_hi = Y * logX_hi + X_lo +// Set GR_Flag = 2 for exp(x + xcor) +// +(p0) fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi + nop.i 999 ;; +} +{ .mmi + nop.m 999 ;; +(p0) getf.exp GR_Expo_X = FR_X + nop.i 999 ;; +} +{ .mfi +(p0) and GR_Expo_X = GR_Expo_X, GR_Mask +// +// Calculate unBIASed exponent of X +// Point to Table of W1s +// Point to Table of W2s +// +(p0) fcvt.fx.s1 FR_N = FR_float_N + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo +// +// Float_N = X * L_Inv +// Create exponent BIAS +// Get BIASed exponent of X +// +(p0) sub GR_Expo_X = GR_Expo_X, GR_BIAS ;; +} +{ .mib +(p0) cmp.gt.unc p9, p0 = -6, 
GR_Expo_X + nop.i 999 +// +// N = fcvt.fx(float_N) +// If -6 > Expo_X, set P9 +// +(p9) br.cond.spnt L(EXPL_SMALL) +} +;; + +// +// If expo_X < -6 goto exp_small +// +{ .mmi + nop.m 999 +(p0) addl GR_T1_ptr = @ltoff(Constants_exp_64_T1#), gp +(p0) cmp.lt.unc p10, p0 = 14, GR_Expo_X +} +;; + +{ .mmi + ld8 GR_T1_ptr = [GR_T1_ptr] + nop.m 999 + nop.i 999 +} +;; + +{ .mib + nop.m 999 + nop.i 999 +// +// If 14 < Expo_X, set P10 +// Create pointer to T1 table +// +(p10) br.cond.spnt L(EXPL_HUGE) ;; +} + + +{ .mmi +(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Exponents#), gp +(p0) addl GR_T2_ptr = @ltoff(Constants_exp_64_T2#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] + ld8 GR_T2_ptr = [GR_T2_ptr] + nop.i 999 +} +;; + + +{ .mmi +(p0) shladd GR_Table_Ptr = GR_Expo_Range,4,GR_Table_Ptr ;; +// +// Adjust T1_ptr by x 4 for single-precision values +// Adjust T2_ptr by x 4 for single-precision values +// +(p0) ld8 GR_Big_Pos_Exp = [GR_Table_Ptr],8 + nop.i 999 ;; +} +// +// Load double W1 +// Load +max exponent +// +{ .mfi +(p0) ld8 GR_Big_Neg_Exp = [GR_Table_Ptr],0 +// +// If 14 < Expo_X, goto exp_huge +// +(p0) fcvt.xf FR_float_N = FR_N + nop.i 999 +} +;; + +// +// Load double W2 +// Load -max exponent +// Load ptr to A's +// + +{ .mmi +(p0) getf.sig GR_N_fix = FR_N +(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_A#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] + nop.m 999 + nop.i 999 +} +;; + +// +// Load single T1 +// Load single T2 +// W_1_p1 = W_1 + 1 +// +{ .mmi +(p0) ldfe FR_A_3 = [GR_Table_Ptr],16 ;; +// +// Load A_3 +// if k > big_pos_exp, set p14 and Safe=False +// +(p0) ldfe FR_A_2 = [GR_Table_Ptr],16 +(p0) extr.u GR_M1 = GR_N_fix, 6, 6 +} +{ .mmi + nop.m 999 ;; +(p0) shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr +// +// float_N = fcvt.xf(N) +// N_fix = significand of N +// Create pointer to T2 table +// +(p0) extr.u GR_M2 = GR_N_fix, 0, 6 +} +// +// r = r + X_cor +// Adjust W1_ptr by x 8 for double-precision values +// 
Adjust W2_ptr by x 8 for double-precision values +// Adjust Table_ptr by Expo_Rangex16 +// +{ .mmi +(p0) shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr ;; +(p0) ldfd FR_W1 = [GR_W1_ptr],0 +(p0) shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr +} +// +// Load ptr to A's +// +{ .mfi +(p0) ldfs FR_T1 = [GR_T1_ptr],0 +(p0) fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X +(p0) shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr ;; +} +{ .mmi +(p0) ldfd FR_W2 = [GR_W2_ptr],0 +(p0) ldfs FR_T2 = [GR_T2_ptr],0 +// +// r = x - L_hi * float_N +// M2 = extr.u(N_fix,0,6) +// M1 = extr.u(N_fix,6,6) +// +(p0) extr GR_k = GR_N_fix, 12, 52 ;; +} +// +// Load A_1 +// poly = A_3 * r + A_2 +// rsq = r*r +// +{ .mii +(p0) add GR_BIAS_p_k = GR_BIAS, GR_k +(p0) cmp.gt.unc p14,p15 = GR_k,GR_Big_Pos_Exp ;; +(p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp +} +// +// BIAS_p_K = BIAS + k +// T = T1 * T2 +// +{ .mfi +(p0) setf.exp FR_Scale = GR_BIAS_p_k + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r + nop.i 999 +} +// +// W = W_1_p1 * W2 + W1 +// +{ .mfi +(p0) ldfe FR_A_1 = [GR_Table_Ptr],16 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_W_1_p1 = FR_W1, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// k = extr.u(N_fix,0,6) +// r = r - N * L_lo +// Load ptr to Table of exponent thresholds. 
+// +(p0) fadd.s1 FR_r = FR_r, FR_X_cor + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 FR_T = FR_T1, FR_T2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if k < big_neg_exp, set p14 and Safe=False +// Load A_2 +// +(p0) fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 FR_rsq = FR_r, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) mov FR_Y_hi = FR_T + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Scale = set_exp(BIAS_p_k) +// poly = r * poly + A_1 +// +(p0) fadd.s1 FR_Wp1 = FR_W, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly = FR_r, FR_poly, FR_A_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly = FR_rsq, FR_poly,FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Wp1 = W + 1 +// poly = rsq * poly + rk +// +(p0) fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Y_lo = poly * Wp1 + W +// Y_hi = T +// +(p0) fmpy.s1 FR_Y_lo = FR_Y_lo, FR_T +// +// Y_lo = T * Y_lo +// +(p0) br.cond.sptk L(EXPL_RETURN) ;; +} + +L(EXPL_SMALL): + +// +// r4 = rsq * rsq +// + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Ptr1 = @ltoff(Constants_exp_64_P), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr1 = [GR_Table_Ptr1] + nop.m 999 + nop.i 999 +} +;; + +{ .mmf + nop.m 999 +(p0) ldfe FR_P_6 = [GR_Table_Ptr1],16 +// +// Return +// +(p0) fadd.s1 FR_r = FR_X,f0 ;; +} + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Exponents#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Ptr = [GR_Table_Ptr] +(p0) ldfe FR_P_5 = [GR_Table_Ptr1],16 + nop.i 999 +} +;; + +// +// Is input very small? +// Load P_5 +// +{ .mii +(p0) ldfe FR_P_4 = [GR_Table_Ptr1],16 +(p0) add GR_Table_Ptr = 0x040,GR_Table_Ptr ;; +(p0) shladd GR_Table_Ptr = GR_Expo_Range,3,GR_Table_Ptr ;; +} +{ .mmb +(p0) ldfe FR_P_3 = [GR_Table_Ptr1],16 +// +// Adjust ptr. 
+// +(p0) ld8 GR_vsm_expo = [GR_Table_Ptr],0 + nop.b 999 ;; +} +{ .mfi + nop.m 999 +// +// r = X (don't seem to need X_Cor) +// Load the threshold exponents +// +(p0) fmpy.s1 FR_rsq = FR_r, FR_r + nop.i 999 ;; +} +// +// Load the negative integer +// Load P_5 +// +{ .mfi +(p0) cmp.lt.unc p12, p0 = GR_Expo_X, GR_vsm_expo + nop.f 999 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// rsq = r * r +// Offset into exponents +// +(p0) fmpy.s1 FR_r4 = FR_rsq, FR_rsq +(p12) br.cond.spnt L(EXPL_VERY_SMALL) ;; +} +{ .mfi +(p0) ldfe FR_P_2 = [GR_Table_Ptr1],16 +// +// Load p4,p3,p2,p1 +// +(p0) fma.s1 FR_poly_lo = FR_P_6, FR_r, FR_P_5 +// +// Y_lo = r4 * poly_lo + poly_hi +// Scale = 1.0 +// +(p0) add GR_temp = 0x1,r0 ;; +} +{ .mmf + nop.m 999 +(p0) ldfe FR_P_1 = [GR_Table_Ptr1],0 +(p0) mov FR_Scale = f1 +} +// +// Begin creating lsb to perturb final result +// +{ .mfi +(p0) setf.sig FR_temp = GR_temp +(p0) mov FR_Y_hi = f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly_lo = p_5 + p_6 * r +// poly_hi = p_1 + p_2 * r +// +(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_P_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly_lo = p_4 + poly_lo * r +// poly_hi = r + poly_hi * rsq +// +(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_P_3 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly_hi = FR_P_2, FR_r, FR_P_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly_lo = p_3 + poly_lo * r +// Y_hi = 1, always +// +(p0) fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Set lsb in fp register +// +(p0) for FR_temp = FR_Y_lo,FR_temp + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Toggle on last bit of Y_lo +// +(p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_temp +// +// Set lsb of Y_lo to 1 +// +(p0) br.cond.sptk L(EXPL_RETURN) ;; +} +L(EXPL_VERY_SMALL): +{ .mfi + nop.m 999 +(p0) mov FR_Y_lo = FR_r +(p0) cmp.eq.unc p15, p0 = r0, r0 +} +{ .mfi + nop.m 999 +(p0) mov 
FR_Scale = f1 + nop.i 999 +};; +{ .mfb + nop.m 999 +(p0) mov FR_Y_hi = f1 +// +// If flag_not_1, +// Y_hi = 1.0 +// Y_lo = X + X_cor +// PR_Safe = true +// +(p0) br.cond.sptk L(EXPL_RETURN) ;; +} +L(EXPL_HUGE): +{ .mfi + nop.m 999 +// +// Return for flag=2 +// +(p0) fcmp.gt.unc.s1 p12, p13 = FR_X, f0 +(p0) cmp.eq.unc p14, p15 = r0, r0 ;; +} +{ .mlx + nop.m 999 +// +// Set Safe to false +// Is x > 0 +// +(p12) movl GR_Mask = 0x15DC0 ;; +} +{ .mlx +(p12) setf.exp FR_Y_hi = GR_Mask +(p13) movl GR_Mask = 0xA240 ;; +} +{ .mlx +(p13) setf.exp FR_Y_hi = GR_Mask +// +// x > 0: Create mask for Y_hi = 2**(24,000) +// x <= 0: Create mask for Y_hi = 2**(-24,000) +// +(p13) movl GR_temp = 0xA1DC ;; +} +{ .mfi +(p13) setf.exp FR_Y_lo = GR_temp +// +// x < =0: Create mask for 2**(-24,100) +// x <= 0: Y_lo = w**(-24,100) +// +(p12) mov FR_Y_lo = f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) mov FR_Scale = FR_Y_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// x > 0: Y_lo = 1.0 +// x > 0: Scale = 2**(24,000) +// +(p13) mov FR_Scale = FR_Y_hi + nop.i 999 ;; +} +L(EXPL_RETURN): +{ .mfi + nop.m 999 +// +// Scale = 2**(24,000) +// +// +// exp(y *ln(x)) almost complete +// FR_Scale is Scale +// f34 is Z_hi +// f35 is Z_lo +// +(p0) fmpy.s1 FR_Sgn = FR_Scale, FR_Sgn + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// sgn * scale +// +(p0) fmpy.s1 FR_Y_lo = FR_Y_lo,FR_Sgn + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Z_lo * (sgn * scale) +// +(p0) fma.s0 FR_Result = FR_Y_hi, FR_Sgn, FR_Y_lo +// +// Z_hi * (sgn * scale) + Z_lo +// +(p15) br.cond.sptk L(POWL_64_RETURN) ;; +} +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x01 + nop.i 999 +} +{ .mlx + nop.m 999 +// +// Z_hi * (sgn * scale) + Z_lo with wre & td +// Z_hi * (sgn * scale) + Z_lo with fz & td +// +(p0) movl GR_T1_ptr = 0x00000000013FFF ;; +} +{ .mfi + nop.m 999 +(p0) fma.s3 FR_Result_small = FR_Y_hi, FR_Sgn, FR_Y_lo + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x40 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Return if no danger 
of over of underflow. +// +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} +{ .mfi + nop.m 999 +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +(p0) fma.s2 FR_Result_big = FR_Y_hi, FR_Sgn, FR_Y_lo + nop.i 999 ;; +} +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +// +// If (Safe) is true, then +// Compute result using user supplied status field. +// No overflow or underflow here, but perhaps inexact. +// Return +// Else +// Determine if overflow or underflow was raised. +// Fetch +/- overflow threshold for IEEE single, double, +// double extended +// +{ .mfi +(p0) setf.exp FR_Big = GR_T1_ptr +(p0) fsetc.s2 0x7F,0x40 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = FR_Result_small, 0x00F + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmerge.ns FR_NBig = FR_Big, FR_Big + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Create largest double exponent + 1. +// Create smallest double exponent - 1. +// Identify denormals +// +(p0) fcmp.ge.unc.s1 p8, p0 = FR_Result_big , FR_Big + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +// +// fcmp: resultS2 <= - overflow threshold +// fclass: resultS3 is denorm/unorm/0 +// +(p8) mov GR_Parameter_TAG = 18 ;; +} +{ .mfb + nop.m 999 +// +// fcmp: resultS2 >= + overflow threshold +// +(p0) fcmp.le.unc.s1 p9, p0 = FR_Result_big, FR_NBig +(p8) br.cond.spnt __libm_error_region ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +(p9) mov GR_Parameter_TAG = 18 +} +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt __libm_error_region ;; +} +// +// Report that pow overflowed - either +Inf, or -Inf +// +{ .mmb +(p11) mov GR_Parameter_TAG = 19 + nop.m 999 +(p11) br.cond.spnt __libm_error_region ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Report that pow underflowed +// +(p0) br.cond.sptk L(POWL_64_RETURN) ;; +} + + +L(POWL_64_SQUARE): +// Here if x not zero and y=2. 
+// Must call __libm_error_support for overflow or underflow +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +{ .mfi + nop.m 999 +(p0) fma.s0 FR_Result = FR_Input_X, FR_Input_X, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x01 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl GR_T1_ptr = 0x00000000013FFF ;; +} +{ .mfi + nop.m 999 +(p0) fma.s3 FR_Result_small = FR_Input_X, FR_Input_X, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x40 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Return if no danger of over of underflow. +// +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} +{ .mfi + nop.m 999 +(p0) fma.s2 FR_Result_big = FR_Input_X, FR_Input_X, f0 + nop.i 999 ;; +} +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +// +// If (Safe) is true, then +// Compute result using user supplied status field. +// No overflow or underflow here, but perhaps inexact. +// Return +// Else +// Determine if overflow or underflow was raised. +// Fetch +/- overflow threshold for IEEE single, double, +// double extended +// +{ .mfi +(p0) setf.exp FR_Big = GR_T1_ptr +(p0) fsetc.s2 0x7F,0x40 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = FR_Result_small, 0x00F + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmerge.ns FR_NBig = FR_Big, FR_Big + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Create largest double exponent + 1. +// Create smallest double exponent - 1. 
+// Identify denormals +// +(p0) fcmp.ge.unc.s1 p8, p0 = FR_Result_big , FR_Big + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +// +// fcmp: resultS2 <= - overflow threshold +// fclass: resultS3 is denorm/unorm/0 +// +(p8) mov GR_Parameter_TAG = 18 ;; +} +{ .mfb + nop.m 999 +// +// fcmp: resultS2 >= + overflow threshold +// +(p0) fcmp.le.unc.s1 p9, p0 = FR_Result_big, FR_NBig +(p8) br.cond.spnt __libm_error_region ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +(p9) mov GR_Parameter_TAG = 18 +} +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt __libm_error_region ;; +} +// +// Report that pow overflowed - either +Inf, or -Inf +// +{ .mmb +(p11) mov GR_Parameter_TAG = 19 + nop.m 999 +(p11) br.cond.spnt __libm_error_region ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Report that pow underflowed +// +(p0) br.cond.sptk L(POWL_64_RETURN) ;; +} + + + + +L(POWL_64_SPECIAL): +{ .mfi + nop.m 999 +(p0) fcmp.eq.s1 p15, p0 = FR_Input_X, f1 // Is x=+1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p14, p0 = FR_Input_Y, 0x023 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, flag invalid if y=SNaN + nop.i 999 +} +{ .mfb + nop.m 999 +(p15) fmpy.s0 FR_Result = f1,f1 // If x=1, result=1 +(p15) br.cond.spnt L(POWL_64_RETURN) ;; // Exit if x=1 +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p13, p0 = FR_Input_X, 0x023 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p8, p0 = FR_Input_X, 0x143 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9, p0 = FR_Input_Y, 0x143 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p10, p0 = FR_Input_X, 0x083 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = FR_Input_Y, 0x083 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6, p0 = FR_Input_Y, 0x007 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s1 p7, p0 = FR_Input_Y, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// set p13 if x +/- Inf +// set p14 if y +/- Inf +// set p8 if x Natval or +/-SNaN 
+// set p9 if y Natval or +/-SNaN +// set p10 if x QNaN +// set p11 if y QNaNs +// set p6 if y is +/-0 +// set p7 if y is 1 +// +(p8) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X +(p6) cmp.ne p8,p0 = r0,r0 ;; // Don't exit if x=snan, y=0 ==> result=+1 +} +{ .mfb + nop.m 999 +(p9) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X +(p8) br.cond.spnt L(POWL_64_RETURN) ;; +} +{ .mfb + nop.m 999 +(p10) fmpy.s0 FR_Result = FR_Input_X, f0 +(p9) br.cond.spnt L(POWL_64_RETURN) ;; +} +{ .mfi + nop.m 999 +// +// Produce result for SNaN and NatVals and return +// +(p6) fclass.m.unc p15, p0 = FR_Input_X,0x007 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// If Y +/- 0, set p15 if x +/- 0 +// +(p6) fclass.m.unc p8, p0 = FR_Input_X,0x0C3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p6) fcmp.eq.s0 p9,p0 = FR_Input_X, f0 // If y=0, flag if x denormal + nop.i 999 +} +{ .mfi + nop.m 999 +(p6) fadd.s0 FR_Result = f1, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Set p8 if y = +/-0 and X is a QNaN/SNaN +// If y = +/-0, let result = 1.0 +// +(p7) fmpy.s0 FR_Result = FR_Input_X,f1 +// +// If y == 1, result = x * 1 +// +(p15) mov GR_Parameter_TAG = 20 +} +{ .mib + nop.m 999 + nop.i 999 +(p15) br.cond.spnt __libm_error_region ;; +} +{ .mib + nop.m 999 +// +// If x and y are both zero, result = 1.0 and call error +// support. +// +(p8) mov GR_Parameter_TAG = 23 +(p8) br.cond.spnt __libm_error_region ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// If y = +/-0 and x is a QNaN, result = 1.0 and call error +// support. 
+// +(p6) br.cond.spnt L(POWL_64_RETURN) ;; +} + +// If x=0, y=-inf, go to the X_IS_ZERO path +{ .mfb + nop.m 999 +(p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0 +(p7) br.cond.spnt L(POWL_64_RETURN) ;; +} + +{ .mfi + nop.m 999 +// +// Produce all results for x**0 and x**1 +// Let all the result x ** 0 == 1 and return +// Let all x ** 1 == x and return +// +(p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X +(p10) br.cond.spnt L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p11) br.cond.spnt L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Return result for x or y QNaN input with QNaN result +// +(p14) br.cond.spnt L(POWL_64_Y_IS_INF) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p13) br.cond.spnt L(POWL_64_X_IS_INF) ;; +} +L(POWL_64_X_IS_ZERO): +{ .mmb +(p0) getf.sig GR_signif_y = FR_Input_Y +(p0) getf.exp GR_BIASed_exp_y = FR_Input_Y + nop.b 999 ;; +} +{ .mlx + nop.m 999 +(p0) movl GR_Mask = 0x1FFFF +} +{ .mlx + nop.m 999 +(p0) movl GR_y_sign = 0x20000 ;; +} +// +// Get BIASed exp and significand of y +// +{ .mfi +(p0) and GR_exp_y = GR_Mask,GR_BIASed_exp_y + nop.f 999 +(p0) and GR_y_sign = GR_y_sign,GR_BIASed_exp_y +} +{ .mlx + nop.m 999 +(p0) movl GR_BIAS = 0xFFFF ;; +} +{ .mfi +(p0) cmp.lt.unc p9, p8 = GR_exp_y,GR_BIAS + nop.f 999 +// +// Maybe y is < 1 already, so +// can never be an integer. +// Remove sign bit from exponent. 
+// +(p0) sub GR_exp_y = GR_exp_y,GR_BIAS ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +// +// Remove exponent BIAS +// +(p8) shl GR_exp_y= GR_signif_y,GR_exp_y ;; +} +{ .mfi +(p9) or GR_exp_y= 0xF,GR_signif_y + nop.f 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 +// +// Shift significand of y looking for nonzero bits +// For y > 1, shift signif_y exp_y bits to the left +// For y < 1, turn on 4 low order bits of significand of y +// so that the fraction will always be non-zero +// +(p0) shl GR_signif_y= GR_exp_y,1 ;; +(p0) extr.u GR_low_order_bit = GR_exp_y,63,1 +} +// +// Integer part of y shifted off. +// Get y's low even or odd bit - y might not be an int. +// +{ .mii +(p0) cmp.eq.unc p13,p0 = GR_signif_y, r0 +(p0) cmp.eq.unc p8,p9 = GR_y_sign, r0 ;; +// +// Is y an int? +// Is y positive +// +(p13) cmp.ne.unc p13,p0 = GR_low_order_bit, r0 ;; +} +// +// Is y and int and odd? +// +{ .mfb +(p13) cmp.eq.unc p13,p14 = GR_y_sign, r0 +(p8) fcmp.eq.s0 p12,p0 = FR_Input_Y, f0 // If x=0 and y>0 flag if y denormal + nop.b 999 ;; +} +{ .mfb + nop.m 999 +// +// Is y and int and odd and positive? +// +(p13) mov FR_Result = FR_Input_X +(p13) br.cond.sptk L(POWL_64_RETURN) ;; +} +{ .mfi + nop.m 999 +// +// Return +/-0 when x=+/-0 and y is and odd pos. int +// +(p14) frcpa.s0 FR_Result, p10 = f1, FR_Input_X +(p14) mov GR_Parameter_TAG = 21 +} +{ .mib + nop.m 999 + nop.i 999 +(p14) br.cond.spnt __libm_error_region ;; +} + +{ .mfb + nop.m 999 +// +// Return +/-0 when x=+/-Inf and y is and odd neg int +// and raise dz exception +// +(p8) mov FR_Result = f0 +(p8) br.cond.sptk L(POWL_64_RETURN) ;; +} +{ .mfi + nop.m 999 +// +// Return +0 when x=+/-0 and y > 0 and not odd. 
+// +(p9) frcpa.s0 FR_Result, p10 = f1,f0 +(p9) mov GR_Parameter_TAG = 21 +} +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.sptk __libm_error_region ;; +} +L(POWL_64_X_IS_INF): +{ .mfi +(p0) getf.exp GR_exp_y = FR_Input_Y +(p0) fclass.m.unc p13, p0 = FR_Input_X,0x022 +(p0) mov GR_Mask = 0x1FFFF ;; +} + +{ .mfi +(p0) getf.sig GR_signif_y = FR_Input_Y +(p0) fcmp.eq.s0 p9,p0 = FR_Input_Y, f0 // Flag if y denormal + nop.i 999 ;; +} + +// +// Get exp and significand of y +// Create exponent mask and sign mask +// +{ .mlx +(p0) and GR_low_order_bit = GR_Mask,GR_exp_y +(p0) movl GR_BIAS = 0xFFFF +} +{ .mmi + nop.m 999 ;; +// +// Remove sign bit from exponent. +// +(p0) cmp.lt.unc p9, p8 = GR_low_order_bit,GR_BIAS +// +// Maybe y is < 1 already, so +// isn't an int. +// +(p0) sub GR_low_order_bit = GR_low_order_bit,GR_BIAS +} +{ .mlx + nop.m 999 +(p0) movl GR_sign_mask = 0x20000 ;; +} +{ .mfi +(p0) and GR_sign_mask = GR_sign_mask,GR_exp_y +// +// Return +Inf when x=+/-0 and y < 0 and not odd and raise +// divide-by-zero exception. +// +(p0) fclass.m.unc p11, p0 = FR_Input_X,0x021 + nop.i 999 ;; +} +{ .mmi + nop.m 999 ;; +// +// Is shift off integer part of y. +// Get y's even or odd bit - y might not be an int. +// +(p11) cmp.eq.unc p11,p12 = GR_sign_mask, r0 +// +// Remove exponent BIAS +// +(p8) shl GR_exp_y = GR_signif_y,GR_low_order_bit ;; +} +{ .mfi +(p9) or GR_exp_y = 0xF,GR_signif_y +// +// Is y positive or negative when x is +Inf? +// Is y and int when x = -Inf +// +(p11) mov FR_Result = FR_Input_X + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) mov FR_Result = f0 + nop.i 999 ;; +} +{ .mii + nop.m 999 +// +// Shift signficand looking for nonzero bits +// For y non-ints, upset the significand. 
+// +(p0) shl GR_signif_y = GR_exp_y,1 ;; +(p13) cmp.eq.unc p13,p0 = GR_signif_y, r0 +} +{ .mii + nop.m 999 +(p0) extr.u GR_low_order_bit = GR_exp_y,63,1 ;; +(p13) cmp.ne.unc p13,p0 = GR_low_order_bit, r0 +} +{ .mib + nop.m 999 + nop.i 999 +(p11) br.cond.sptk L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p12) br.cond.sptk L(POWL_64_RETURN) ;; +} +// +// Return Inf for y > 0 +// Return +0 for y < 0 +// Is y even or odd? +// +{ .mii +(p13) cmp.eq.unc p13,p10 = GR_sign_mask, r0 +(p0) cmp.eq.unc p8,p9 = GR_sign_mask, r0 ;; + nop.i 999 +} +{ .mfi + nop.m 999 +// +// For x = -inf, y is and int, positive +// and odd +// Is y positive in general? +// +(p13) mov FR_Result = FR_Input_X + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p10) fmerge.ns FR_Result = f0, f0 +(p13) br.cond.sptk L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.sptk L(POWL_64_RETURN) ;; +} +{ .mfi + nop.m 999 +// +// Return -Inf for x = -inf and y > 0 and odd int. +// Return -0 for x = -inf and y < 0 and odd int. +// +(p8) fmerge.ns FR_Result = FR_Input_X, FR_Input_X + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p9) mov FR_Result = f0 +(p8) br.cond.sptk L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.sptk L(POWL_64_RETURN) ;; +} +L(POWL_64_Y_IS_INF): +{ .mfi + nop.m 999 +// +// Return Inf for x = -inf and y > 0 not an odd int. +// Return +0 for x = -inf and y < 0 and not an odd int. 
+// +(p0) fclass.m.unc p8, p0 = FR_Input_Y, 0x021 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9, p0 = FR_Input_Y, 0x022 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fabs FR_X = FR_Input_X + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.eq.s0 p10,p0 = FR_Input_X, f0 // flag if x denormal + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Find y = +/- Inf +// Compute |x| +// +(p8) fcmp.lt.unc.s1 p6, p0 = FR_X, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fcmp.gt.unc.s1 p7, p0 = FR_X, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fcmp.lt.unc.s1 p12, p0 = FR_X, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fcmp.gt.unc.s1 p13, p0 = FR_X, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// For y = +Inf and |x| < 1 returns 0 +// For y = +Inf and |x| > 1 returns Inf +// For y = -Inf and |x| < 1 returns Inf +// For y = -Inf and |x| > 1 returns 0 +// +(p6) mov FR_Result = f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) mov FR_Result = FR_Input_Y + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_Y + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p13) mov FR_Result = f0 +// +// Produce x ** +/- Inf results +// +(p6) br.cond.spnt L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.spnt L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p12) br.cond.spnt L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p13) br.cond.spnt L(POWL_64_RETURN) ;; +} +{ .mfb + nop.m 999 +// +// +/-1 ** +/-Inf, result is +1 +// +(p0) fmpy.s0 FR_Result = f1,f1 +(p0) br.cond.sptk L(POWL_64_RETURN) ;; +} +L(POWL_64_UNSUPPORT): +{ .mfb + nop.m 999 +// +// Return NaN and raise invalid +// +(p0) fmpy.s0 FR_Result = FR_Input_X,f0 +// +// Raise exceptions for specific +// values - pseudo NaN and +// infinities. 
+// +(p0) br.cond.sptk L(POWL_64_RETURN) ;; +} +L(POWL_64_XNEG): +{ .mfi + nop.m 999 +(p0) frcpa.s0 FR_Result, p8 = f0, f0 +// +// Raise invalid for x < 0 and +// y not an integer and +// +(p0) mov GR_Parameter_TAG = 22 +} +{ .mib + nop.m 999 + nop.i 999 +(p0) br.cond.sptk __libm_error_region ;; +} +L(POWL_64_SQRT): +{ .mfi + nop.m 999 +(p0) frsqrta.s0 FR_Result,p10 = FR_Input_X + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 f62=FR_Half,FR_Input_X,f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Step (2) +// h = 1/2 * a in f9 +// +(p10) fma.s1 f63=FR_Result,FR_Result,f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Step (3) +// t1 = y0 * y0 in f10 +// +(p10) fnma.s1 f32=f63,f62,f11 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Step (4) +// t2 = 1/2 - t1 * h in f10 +// +(p10) fma.s1 f33=f32,FR_Result,FR_Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Step (5) +// y1 = y0 + t2 * y0 in f13 +// +(p10) fma.s1 f34=f33,f62,f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Step (6) +// t3 = y1 * h in f10 +// +(p10) fnma.s1 f35=f34,f33,f11 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Step (7) +// t4 = 1/2 - t3 * y1 in f10 +// +(p10) fma.s1 f63=f35,f33,f33 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Step (8) +// y2 = y1 + t4 * y1 in f13 +// +(p10) fma.s1 f32=FR_Input_X,f63,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Step (9) +// S = a * y2 in f10 +// +(p10) fma.s1 FR_Result=f63,f62,f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Step (10) +// t5 = y2 * h in f9 +// +(p10) fma.s1 f33=f11,f63,f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Step (11) +// H = 1/2 * y2 in f11 +// +(p10) fnma.s1 f34=f32,f32,f8 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Step (12) +// d = a - S * S in f12 +// +(p10) fnma.s1 f35=FR_Result,f63,f11 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Step (13) +// t6 = 1/2 - t5 * y2 in f7 +// +(p10) fma.s1 f62=f33,f34,f32 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Step (14) +// S1 = S + d * H in f13 +// +(p10) fma.s1 f63=f33,f35,f33 + nop.i 999 
;; +} +{ .mfi + nop.m 999 +// +// Step (15) +// H1 = H + t6 * h in f7 +// +(p10) fnma.s1 f32=f62,f62,FR_Input_X + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Step (16) +// d1 = a - S1 * S1 +// +(p10) fma.s0 FR_Result=f32,f63,f62 +// +// Step (17) +// R = S1 + d1 * H1 +// +(p10) br.cond.sptk L(POWL_64_RETURN) ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Do the Newton-Raphson iteration from the EAS. +// +(p0) br.cond.sptk L(POWL_64_RETURN) ;; +} +// +// Take care of the degenerate cases. +// + +L(POWL_64_RETURN): +{ .mfb + nop.m 999 +(p0) mov FR_Output = FR_Result +(p0) br.ret.sptk b0 ;; +} +.endp powl +ASM_SIZE_DIRECTIVE(powl) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp-32) + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Allocate 64-byte stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack, advance pointer by 16 + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body // Spill the operands and result, call __libm_error_support, reload f8 from the result slot +{ .mib + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y // Back to Parameter 2 address + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp // Recompute Parameter 3 address (sp+48) +};; +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) +.type __libm_error_support#,@function +.global __libm_error_support# diff --git
a/sysdeps/ia64/fpu/e_rem_pio2.c b/sysdeps/ia64/fpu/e_rem_pio2.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/e_rem_pio2.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/e_rem_pio2f.c b/sysdeps/ia64/fpu/e_rem_pio2f.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/e_rem_pio2f.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/e_remainder.S b/sysdeps/ia64/fpu/e_remainder.S new file mode 100644 index 0000000000..c8aca1742f --- /dev/null +++ b/sysdeps/ia64/fpu/e_remainder.S @@ -0,0 +1,592 @@ + .file "remainder.asm" +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, Bob Norin, +// Shane Story, and Ping Tak Peter Tang of the Computational Software Lab, +// Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// History +//==================================================================== +// 2/02/00 Initial version +// 3/02/00 New Algorithm +// 4/04/00 Unwind support added +// 7/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +//11/29/00 Set FR_Y to f9 +// +// API +//==================================================================== +// double remainder(double,double); +// +// Overview of operation +//==================================================================== +// remainder(a,b)=a-i*b, +// where i is an integer such that, if b!=0 and a is finite, +// |a/b-i|<=1/2. If |a/b-i|=1/2, i is even. +// +// Algorithm +//==================================================================== +// a). eliminate special cases +// b). if |a/b|<0.25 (first quotient estimate), return a +// c). use single precision divide algorithm to get quotient q +// rounded to 24 bits of precision +// d). calculate partial remainders (using both q and q-ulp); +// select one and RZ(a/b) based on the sign of |a|-|b|*q +// e). if the exponent difference (exponent(a)-exponent(b)) +// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b) +// and sticky bits to round to integer; exit loop and +// calculate final remainder +// f). if exponent(a)-exponent(b)>=24, select new value of a as +// the partial remainder calculated using RZ(a/b); +// repeat from c). 
+// +// Special cases +//==================================================================== +// a=+/- Inf, or b=+/-0: return NaN, call libm_error_support +// a=NaN or b=NaN: return NaN + +#include "libm_support.h" + +// Registers used +//==================================================================== +// Predicate registers: p6-p14 +// General registers: r2,r3,r28,r29,r32 (ar.pfs), r33-r40 +// Floating point registers: f6-f15,f32 + + .section .text + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + +FR_X = f10 +FR_Y = f9 +FR_RESULT = f8 + + + + .proc remainder# + .align 32 + .global remainder# + .align 32 + +remainder: +#ifdef _LIBC +.global __remainder +.type __remainder,@function +__remainder: +#endif +// inputs in f8, f9 +// result in f8 + +{ .mfi + alloc r32=ar.pfs,1,4,4,0 + // f13=|a| + fmerge.s f13=f0,f8 + nop.i 0 +} + {.mfi + nop.m 0 + // f14=|b| + fmerge.s f14=f0,f9 + nop.i 0;; +} + {.mlx + mov r28=0x2ffdd + // r3=2^{23} (single precision bit pattern) + movl r3=0x4b000000;; +} + +// Y +-NAN, +-inf, +-0? p11 +{ .mfi + setf.exp f32=r28 // f32=E used in Step (4): sign bit set, exponent 0xffdd = -34+bias +(p0) fclass.m.unc p11,p0 = f9, 0xe7 + nop.i 999 +} +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 +// e 3 +// X +-NAN, +-inf, ?
p9 +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f8, 0xe3 + nop.i 999;; +} + +{.mfi + nop.m 0 + mov f12=f0 + nop.i 0 +} +{ .mfi + // set p7=1 + cmp.eq.unc p7,p0=r0,r0 + // Step (1) + // y0 = 1 / b in f10 + frcpa.s1 f10,p6=f13,f14 + nop.i 0;; +} + +{.bbb + (p9) br.cond.spnt L(FREM_X_NAN_INF) + (p11) br.cond.spnt L(FREM_Y_NAN_INF_ZERO) + nop.b 0 +} {.mfi + nop.m 0 + // set D flag if a (f8) is denormal + fnma.s0 f6=f8,f1,f8 + nop.i 0;; +} + + +L(remloop24): + { .mfi + nop.m 0 + // Step (2) + // q0 = a * y0 in f12 + (p6) fma.s1 f12=f13,f10,f0 + nop.i 0 +} { .mfi + nop.m 0 + // Step (3) + // e0 = 1 - b * y0 in f7 + (p6) fnma.s1 f7=f14,f10,f1 + nop.i 0;; +} {.mlx + nop.m 0 + // r2=1.25*2^{-24} + movl r2=0x33a00000;; +} + +{.mfi + nop.m 0 + // q1=q0*(1+e0) + fma.s1 f15=f12,f7,f12 + nop.i 0 +} +{ .mfi + nop.m 0 + // Step (4) + // e1 = e0 * e0 + E in f7 + (p6) fma.s1 f7=f7,f7,f32 + nop.i 0;; +} + {.mii + (p7) getf.exp r29=f12 + (p7) mov r28=0xfffd // r28= -2+bias + nop.i 0;; +} + { .mfi + // f12=2^{23} + setf.s f12=r3 + // Step (5) + // q2 = q1 + e1 * q1 in f11 (rounded to single precision) + (p6) fma.s.s1 f11=f7,f15,f15 + nop.i 0 +} { .mfi + nop.m 0 + // Step (6) + // q2 = q1 + e1 * q1 in f6 (same product, no single precision rounding) + (p6) fma.s1 f6=f7,f15,f15 + nop.i 0;; +} + + {.mmi + // f15=1.25*2^{-24} + setf.s f15=r2 + // q<1/4 ? (i.e. expon< -2) + (p7) cmp.gt p7,p0=r28,r29 + nop.i 0;; +} + +{.mfb + // r29= -32+bias + mov r29=0xffdf + // if |a/b|<1/4, set D flag before returning + (p7) fma.d.s0 f9=f9,f0,f8 + nop.b 0;; +} + {.mfb + nop.m 0 + // can be combined with bundle above if sign of 0 or + // FTZ enabled are not important + (p7) fmerge.s f8=f8,f9 + // return if 4*|a|<|b| (estimated quotient < 1/4) + (p7) br.ret.spnt b0;; +} + {.mfi + // f7=2^{-32} + setf.exp f7=r29 + // set f8 to current a value | sign + fmerge.s f8=f8,f13 + nop.i 0;; +} + + + {.mfi + getf.exp r28=f6 + // last step ?
(q<2^{23}) + fcmp.lt.unc.s1 p0,p12=f6,f12 + nop.i 0;; +} + {.mfi + nop.m 0 + // r=a-b*q + fnma.s1 f6=f14,f11,f13 + nop.i 0 +} {.mfi + // r2=23+bias + mov r2=0xffff+23 + // q'=q-q*(1.25*2^{-24}) (q'=q-ulp) + fnma.s.s1 f15=f11,f15,f11 + nop.i 0;; +} + {.mmi + nop.m 0 + cmp.eq p11,p14=r2,r28 // p11: exponent of q (r28) is 23+bias, p14: otherwise + nop.i 0;; +} + +.pred.rel "mutex",p11,p14 + {.mfi + nop.m 0 + // if the exponent of q is 23, then r=a-b*2^{23} + (p11) fnma.s1 f13=f12,f14,f13 + nop.i 0 +} +{.mfi + nop.m 0 + // r2=a-b*q' + (p14) fnma.s1 f13=f14,f15,f13 + nop.i 0;; +} + {.mfi + nop.m 0 + // r>0 iff q=RZ(a/b) and inexact + fcmp.gt.unc.s1 p8,p0=f6,f0 + nop.i 0 +} {.mfi + nop.m 0 + // r<0 iff q'=RZ(a/b) and inexact + (p14) fcmp.lt.unc.s1 p9,p10=f6,f0 + nop.i 0;; +} + +.pred.rel "mutex",p8,p9 + {.mfi + nop.m 0 + // (p8) Q=q+(last iteration ? sticky bits:0) + // i.e. Q=q+q*x (x=2^{-32} or 0) + (p8) fma.s1 f11=f11,f7,f11 + nop.i 0 +} {.mfi + nop.m 0 + // (p9) Q=q'+(last iteration ? sticky bits:0) + // i.e. Q=q'+q'*x (x=2^{-32} or 0) + (p9) fma.s1 f11=f15,f7,f15 + nop.i 0;; +} + + {.mfb + nop.m 0 + // (p9) set r=r2 (new a, if not last iteration) + // (p10) new a =r + (p10) mov f13=f6 + (p12) br.cond.sptk L(remloop24);; // loop again unless q<2^{23} +} + +// last iteration + {.mfi + nop.m 0 + // set f9=|b|*sgn(a) + fmerge.s f9=f8,f9 + nop.i 0 +} + {.mfi + nop.m 0 + // round to integer + fcvt.fx.s1 f11=f11 + nop.i 0;; +} + {.mfi + nop.m 0 + // save sign of a + fmerge.s f7=f8,f8 + nop.i 0 +} {.mfi + nop.m 0 + // normalize + fcvt.xf f11=f11 + nop.i 0;; +} + {.mfi + nop.m 0 + // This can be removed if sign of 0 is not important + // get remainder using sf1 + fnma.d.s1 f12=f9,f11,f8 + nop.i 0 +} + {.mfi + nop.m 0 + // get remainder + fnma.d.s0 f8=f9,f11,f8 + nop.i 0;; +} + {.mfi + nop.m 0 + // f12=0?
+ // This can be removed if sign of 0 is not important + fcmp.eq.unc.s1 p8,p0=f12,f0 + nop.i 0;; +} + {.mfb + nop.m 0 + // if the remainder is 0 (f12=0), give f8 the saved sign of a (f7) + // This can be removed if sign of 0 is not important + (p8) fmerge.s f8=f7,f8 + // return + br.ret.sptk b0;; +} + + +L(FREM_X_NAN_INF): + +// Y zero ? +{.mfi + nop.m 0 + fma.s1 f10=f9,f1,f0 + nop.i 0;; +} +{.mfi + nop.m 0 + fcmp.eq.unc.s1 p11,p0=f10,f0 + nop.i 0;; +} +{.mib + nop.m 0 + nop.i 0 + // if Y zero + (p11) br.cond.spnt L(FREM_Y_ZERO);; +} + +// X infinity? Return QNAN indefinite +{ .mfi + nop.m 999 +(p0) fclass.m.unc p8,p0 = f8, 0x23 + nop.i 999 +} +// X infinity? p11 selects the branch to L(EXP_ERROR_RETURN) below +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11,p0 = f8, 0x23 + nop.i 999;; +} +// Y NaN ? (p8 stays set only if X is infinity and Y is not a NaN) +{.mfi + nop.m 999 +(p8) fclass.m.unc p0,p8=f9,0xc3 + nop.i 0;; +} +{.mfi + nop.m 999 + // also set Denormal flag if necessary +(p8) fma.s0 f9=f9,f1,f0 + nop.i 0 +} +{ .mfi + nop.m 999 +(p8) frcpa.s0 f8,p7 = f8,f8 + nop.i 999 ;; +} + +{.mfi + nop.m 999 +(p11) mov f10=f8 // f10 is FR_X for the error path. NOTE(review): when p8 is set, f8 was just overwritten by the frcpa above - confirm this is the intended FR_X + nop.i 0 +} +{ .mfi + nop.m 999 +(p8) fma.d f8=f8,f1,f0 + nop.i 0 ;; +} + +{ .mfb + nop.m 999 + frcpa.s0 f8,p7=f8,f9 + (p11) br.cond.spnt L(EXP_ERROR_RETURN);; +} +{ .mib + nop.m 0 + nop.i 0 + br.ret.spnt b0 ;; +} + + +L(FREM_Y_NAN_INF_ZERO): + +// Y INF +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0x23 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p7) fma.d f8=f8,f1,f0 +(p7) br.ret.spnt b0 ;; +} + +// Y NAN? +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f9, 0xc3 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p9) fma.d f8=f9,f1,f0 +(p9) br.ret.spnt b0 ;; +} + +L(FREM_Y_ZERO): +// Y zero? Must be zero at this point +// because it is the only choice left. +// Return QNAN indefinite + +// X NAN?
+{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p10 = f8, 0xc3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fclass.nm p9,p10 = f8, 0xff + nop.i 999 ;; +} + +{.mfi + nop.m 999 + (p9) frcpa f11,p7=f8,f0 + nop.i 0;; +} + +{ .mfi + nop.m 999 +(p10) frcpa f11,p7 = f0,f0 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s f10 = f8, f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.d f8=f11,f1,f0 + nop.i 999 +} + + +L(EXP_ERROR_RETURN): + +{ .mib +(p0) mov GR_Parameter_TAG = 124 + nop.i 999 +(p0) br.sptk __libm_error_region;; +} + +.endp remainder +ASM_SIZE_DIRECTIVE(remainder) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__remainder) +#endif + + + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp-32) + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Allocate 64-byte stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack, advance pointer by 16 + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body // Spill the operands and result, call __libm_error_support, reload f8 from the result slot +{ .mib + stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y // Back to Parameter 2 address + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp // Recompute Parameter 3 address (sp+48) after the call +};; +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git
a/sysdeps/ia64/fpu/e_remainderf.S b/sysdeps/ia64/fpu/e_remainderf.S new file mode 100644 index 0000000000..e3b8b8a617 --- /dev/null +++ b/sysdeps/ia64/fpu/e_remainderf.S @@ -0,0 +1,611 @@ + .file "remainderf.asm" +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational +// Software Lab, +// Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//==================================================================== +// 2/02/00 Initial version +// 3/02/00 New algorithm +// 4/04/00 Unwind support added +// 7/21/00 Fixed quotient=2^{24*m+23} bug +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. 
+//11/29/00 Set FR_Y to f9 +// +// API +//==================================================================== +// float remainderf(float,float); +// +// Overview of operation +//==================================================================== +// remainder(a,b)=a-i*b, +// where i is an integer such that, if b!=0 and a is finite, +// |a/b-i|<=1/2. If |a/b-i|=1/2, i is even. +// +// Algorithm +//==================================================================== +// a). eliminate special cases +// b). if |a/b|<0.25 (first quotient estimate), return a +// c). use single precision divide algorithm to get quotient q +// rounded to 24 bits of precision +// d). calculate partial remainders (using both q and q-ulp); +// select one and RZ(a/b) based on the sign of |a|-|b|*q +// e). if the exponent difference (exponent(a)-exponent(b)) +// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b) +// and sticky bits to round to integer; exit loop and +// calculate final remainder +// f). if exponent(a)-exponent(b)>=24, select new value of a as +// the partial remainder calculated using RZ(a/b); +// repeat from c). 
+// +// Special cases +//==================================================================== +// a=+/- Inf, or b=+/-0: return NaN, call libm_error_support +// a=NaN or b=NaN: return NaN + +#include "libm_support.h" + +// +// Registers used +//==================================================================== +// Predicate registers: p6-p12 +// General registers: r2,r3,r28,r29,r32 (ar.pfs), r33-r40 +// Floating point registers: f6-f15 +// + +.section .text + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + +FR_X = f10 +FR_Y = f9 +FR_RESULT = f8 + + + .proc remainderf# + .align 32 + .global remainderf# + .align 32 + +remainderf: +#ifdef _LIBC +.global __remainderf +.type __remainderf,@function +__remainderf: +#endif +// inputs in f8, f9 +// result in f8 + +{ .mfi + alloc r32=ar.pfs,1,4,4,0 + // f13=|a| + fmerge.s f13=f0,f8 + nop.i 0 +} + {.mfi + nop.m 0 + // f14=|b| + fmerge.s f14=f0,f9 + nop.i 0;; +} + {.mlx + nop.m 0 + // r3=2^{24}-2 (single precision bit pattern) + movl r3=0x4b7ffffe;; +} + +// Y +-NAN, +-inf, +-0? p11 +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11,p0 = f9, 0xe7 + nop.i 999 +} +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 +// e 3 +// X +-NAN, +-inf, ?
p9 +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f8, 0xe3 + nop.i 999;; +} + +{.mfi + nop.m 0 + mov f15=f0 + nop.i 0 +} +{ .mfi + // set p7=1 + cmp.eq.unc p7,p0=r0,r0 + // Step (1) + // y0 = 1 / b in f10 + frcpa.s1 f10,p6=f13,f14 + nop.i 0;; +} +{.bbb + (p9) br.cond.spnt L(FREM_X_NAN_INF) + (p11) br.cond.spnt L(FREM_Y_NAN_INF_ZERO) + nop.b 0 +} {.mfi + nop.m 0 + // set D flag if a (f8) is denormal + fnma.s0 f6=f8,f1,f8 + nop.i 0;; +} + +.align 32 +L(remloop24): + { .mfi + // f12=2^{24}-2 + setf.s f12=r3 + // Step (2) + // q0 = a * y0 in f15 + (p6) fma.s1 f15=f13,f10,f0 + nop.i 0 +} +{ .mfi + nop.m 0 + // Step (3) + // e0 = 1 - b * y0 in f7 + (p6) fnma.s1 f7=f14,f10,f1 + nop.i 0;; +} +{.mlx + nop.m 0 + // r2=1.25*2^{-24} + movl r2=0x33a00000;; +} + { .mfi + nop.m 0 + // Step (4) + // q1 = q0 + e0 * q0 in f6 + (p6) fma.s1 f6=f7,f15,f15 + nop.i 0 +} +{ .mfi + nop.m 0 + // Step (5) + // e1 = e0 * e0 in f7 + (p6) fma.s1 f7=f7,f7,f0 + nop.i 0;; +} + {.mii + (p7) getf.exp r29=f15 + (p7) mov r28=0xfffd // r28= -2+bias + nop.i 0;; +} + + { .mfi + // f15=1.25*2^{-24} + setf.s f15=r2 + // Step (6) + // q2 = q1 + e1 * q1 in f6 + (p6) fma.s1 f6=f7,f6,f6 + nop.i 0 +} +{ .mfi + mov r2=0x3e7 + // Step (7) + // e2 = e1 * e1 in f7 + (p6) fma.s1 f7=f7,f7,f0 + nop.i 0;; +} + + {.mmi + // q<1/4 ? (i.e. expon< -2) + (p7) cmp.gt.unc p7,p0=r28,r29 + nop.m 0 + // r2=0x3e7000000 + shl r2=r2,24;; +} + +{.mfb + // r2=0x3e7000001 + add r2=1,r2 + // if |a/b|<1/4, set D flag before returning + (p7) fma.s.s0 f9=f9,f0,f8 + nop.b 0;; +} + {.mfb + nop.m 0 + // can be combined with bundle above if sign of 0 or + // FTZ enabled are not important + (p7) fmerge.s f8=f8,f9 + // return if 4*|a|<|b| (estimated quotient < 1/4) + (p7) br.ret.spnt b0;; +} + {.mfi + nop.m 0 + // set f8 to current a value | sign + fmerge.s f8=f8,f13 + // r2=2^{-24}+2^{-48} (double prec.)
+ shl r2=r2,28;; +} + + +{ .mfi + // r29= -32+bias + mov r29=0xffdf + // Step (8) + // q3 = q2 + e2 * q2 in f6 + (p6) fma.d.s1 f6=f7,f6,f6 + nop.i 0;; +} +{ .mfi + nop.m 0 + // Step (9) + // q = q3 in f11 + (p6) fma.s.s1 f11=f6,f1,f0 + nop.i 0;; +} + {.mfi + // f7=2^{-24}+2^{-48} (double precision bit pattern built in r2) + setf.d f7=r2 + // last step ? (q3<2^{24}-2 --> q<2^{24}) + fcmp.lt.unc.s1 p0,p12=f6,f12 + nop.i 0 +} {.mfi + // f12=2^{-32} + setf.exp f12=r29 + nop.f 0 + nop.i 0;; +} + {.mfi + nop.m 0 + // r=a-b*q + fnma.s1 f6=f14,f11,f13 + nop.i 0 +} +{.mfi + nop.m 0 + // q'=q-q*(1.25*2^{-24}) (q'=q-ulp) + fnma.s.s1 f15=f11,f15,f11 + nop.i 0;; +} + + {.mfi + nop.m 0 + // r2=a-b*q' + fnma.s1 f13=f14,f15,f13 + nop.i 0;; +} + {.mfi + nop.m 0 + // r>0 iff q=RZ(a/b) and inexact + fcmp.gt.unc.s1 p8,p0=f6,f0 + nop.i 0 +} +{.mfi + nop.m 0 + // r<0 iff q'=RZ(a/b) and inexact + fcmp.lt.unc.s1 p9,p10=f6,f0 + nop.i 0;; +} +.pred.rel "mutex",p8,p9 + {.mfi + nop.m 0 + // (p8) Q=q+(last iteration ? sticky bits:0) + // i.e. Q=q+q*x (x=2^{-32} or 0) + (p8) fma.s1 f11=f11,f12,f11 + nop.i 0 +} +{.mfi + nop.m 0 + // (p9) Q=q'+(last iteration ? sticky bits:0) + // i.e. Q=q'+q'*x (x=2^{-24} or 0: if expon. difference=23, want to round back to q) + (p9) fma.s1 f11=f15,f7,f15 + nop.i 0;; +} + + {.mfb + nop.m 0 + // (p9) set r=r2 (new a, if not last iteration) + // (p10) new a =r + (p10) mov f13=f6 + (p12) br.cond.sptk L(remloop24);; // loop again unless q3<2^{24}-2 +} + +// last iteration + {.mfi + nop.m 0 + // set f9=|b|*sgn(a) + fmerge.s f9=f8,f9 + nop.i 0 +} + {.mfi + nop.m 0 + // round to integer + fcvt.fx.s1 f11=f11 + nop.i 0;; +} + {.mfi + nop.m 0 + // save sign of a + fmerge.s f7=f8,f8 + nop.i 0 +} +{.mfi + nop.m 0 + // normalize + fcvt.xf f11=f11 + nop.i 0;; +} + {.mfi + nop.m 0 + // This can be removed if sign of 0 is not important + // get remainder using sf1 + fnma.s.s1 f12=f9,f11,f8 + nop.i 0 +} + {.mfi + nop.m 0 + // get remainder + fnma.s.s0 f8=f9,f11,f8 + nop.i 0;; +} + + + + {.mfi + nop.m 0 + // f12=0?
+ // This can be removed if sign of 0 is not important + fcmp.eq.unc.s1 p8,p0=f12,f0 + nop.i 0;; +} + {.mfb + nop.m 0 + // if the remainder is 0 (f12=0), give f8 the saved sign of a (f7) + // This can be removed if sign of 0 is not important + (p8) fmerge.s f8=f7,f8 + // return + br.ret.sptk b0;; +} + + +L(FREM_X_NAN_INF): + +// Y zero ? +{.mfi + nop.m 0 + fma.s1 f10=f9,f1,f0 + nop.i 0;; +} +{.mfi + nop.m 0 + fcmp.eq.unc.s1 p11,p0=f10,f0 + nop.i 0;; +} +{.mib + nop.m 0 + nop.i 0 + // if Y zero + (p11) br.cond.spnt L(FREM_Y_ZERO);; +} + +// X infinity? Return QNAN indefinite +{ .mfi + nop.m 999 +(p0) fclass.m.unc p8,p0 = f8, 0x23 + nop.i 999 +} +// X infinity? p11 selects the branch to L(EXP_ERROR_RETURN) below +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11,p0 = f8, 0x23 + nop.i 999;; +} +// Y NaN ? (p8 stays set only if X is infinity and Y is not a NaN) +{.mfi + nop.m 999 +(p8) fclass.m.unc p0,p8=f9,0xc3 + nop.i 0;; +} +{.mfi + nop.m 999 + // also set Denormal flag if necessary +(p8) fma.s0 f9=f9,f1,f0 + nop.i 0 +} +{ .mfi + nop.m 999 +(p8) frcpa.s0 f8,p7 = f8,f8 + nop.i 999 ;; +} + +{.mfi + nop.m 999 +(p11) mov f10=f8 // f10 is FR_X for the error path. NOTE(review): when p8 is set, f8 was just overwritten by the frcpa above - confirm this is the intended FR_X + nop.i 0 +} +{ .mfi + nop.m 999 +(p8) fma.s f8=f8,f1,f0 + nop.i 0 ;; +} + +{ .mfb + nop.m 999 + frcpa.s0 f8,p7=f8,f9 + (p11) br.cond.spnt L(EXP_ERROR_RETURN);; +} +{ .mib + nop.m 0 + nop.i 0 + br.ret.spnt b0 ;; +} + + +L(FREM_Y_NAN_INF_ZERO): + +// Y INF +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0x23 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p7) fma.s f8=f8,f1,f0 +(p7) br.ret.spnt b0 ;; +} + +// Y NAN? +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f9, 0xc3 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p9) fma.s f8=f9,f1,f0 +(p9) br.ret.spnt b0 ;; +} + +L(FREM_Y_ZERO): +// Y zero? Must be zero at this point +// because it is the only choice left. +// Return QNAN indefinite + +// X NAN?
+{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p10 = f8, 0xc3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fclass.nm p9,p10 = f8, 0xff + nop.i 999 ;; +} + +{.mfi + nop.m 999 + (p9) frcpa f11,p7=f8,f0 + nop.i 0;; +} + +{ .mfi + nop.m 999 +(p10) frcpa f11,p7 = f0,f0 +nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s f10 = f8, f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s f8=f11,f1,f0 + nop.i 999 +} + + +L(EXP_ERROR_RETURN): + +{ .mib +(p0) mov GR_Parameter_TAG = 125 + nop.i 999 +(p0) br.sptk __libm_error_region;; +} + +.endp remainderf +ASM_SIZE_DIRECTIVE(remainderf) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__remainderf) +#endif + + + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp-32) + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Allocate 64-byte stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack, advance pointer by 16 + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body // Spill the operands and result, call __libm_error_support, reload f8 from the result slot +{ .mib + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y // Back to Parameter 2 address + br.call.sptk b0=__libm_error_support#;; // Call error handling function +} +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp // Recompute Parameter 3 address (sp+48) after the call +};; +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git
a/sysdeps/ia64/fpu/e_remainderl.S b/sysdeps/ia64/fpu/e_remainderl.S new file mode 100644 index 0000000000..7a46575bfd --- /dev/null +++ b/sysdeps/ia64/fpu/e_remainderl.S @@ -0,0 +1,619 @@ +.file "remainderl.asm" +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational +// Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//==================================================================== +// 2/02/00 Initial version +// 3/02/00 New algorithm +// 4/04/00 Unwind support added +// 7/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. 
+//11/29/00 Set FR_Y to f9 +// +// API +//==================================================================== +// long double remainderl(long double,long double); +// +// Overview of operation +//==================================================================== +// remainder(a,b)=a-i*b, +// where i is an integer such that, if b!=0 and a is finite, +// |a/b-i|<=1/2. If |a/b-i|=1/2, i is even. +// +// Algorithm +//==================================================================== +// a). eliminate special cases +// b). if |a/b|<0.25 (first quotient estimate), return a +// c). use single precision divide algorithm to get quotient q +// rounded to 24 bits of precision +// d). calculate partial remainders (using both q and q-ulp); +// select one and RZ(a/b) based on the sign of |a|-|b|*q +// e). if the exponent difference (exponent(a)-exponent(b)) +// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b) +// and sticky bits to round to integer; exit loop and +// calculate final remainder +// f). if exponent(a)-exponent(b)>=24, select new value of a as +// the partial remainder calculated using RZ(a/b); +// repeat from c). 
+// +// Special cases +//==================================================================== +// a=+/- Inf, or b=+/-0: return NaN, call libm_error_support +// a=NaN or b=NaN: return NaN + +#include "libm_support.h" + +// +// Registers used +//==================================================================== +// Predicate registers: p6-p14 +// General registers: r2,r3,r28,r29,r32 (ar.pfs), r33-r40 +// Floating point registers: f6-f15,f32 +// +.section .text + + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + +FR_X = f10 +FR_Y = f9 +FR_RESULT = f8 + + + + + .proc remainderl# + .align 32 + .global remainderl# + .align 32 + +remainderl: +#ifdef _LIBC +.global __remainderl +.type __remainderl,@function +__remainderl: +#endif +// inputs in f8 (a), f9 (b) +// result in f8; special operands branch out to L(FREM_X_NAN_INF) / L(FREM_Y_NAN_INF_ZERO) before L(remloop24) + +{ .mfi + alloc r32=ar.pfs,1,4,4,0 + // f13=|a| + fmerge.s f13=f0,f8 + nop.i 0 +} + {.mfi + getf.sig r29=f9 + // f14=|b| + fmerge.s f14=f0,f9 + nop.i 0;; +} + {.mlx + mov r28=0x2ffdd + // r3=2^{23} (single-precision bit pattern, loaded into f12 with setf.s below) + movl r3=0x4b000000;; +} + + +{.mmi +setf.exp f32=r28 +nop.m 0 +// y pseudo-zero ? (significand of b, read into r29 above, is 0) +cmp.eq p11,p10=r29,r0;; +} + +// Y +-NAN, +-inf, +-0? p11 +{ .mfi + nop.m 999 +(p10) fclass.m p11,p10 = f9, 0xe7 + nop.i 999 +} +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 +// e 3 +// X +-NAN, +-inf, ? 
p9 + +{ .mfi + nop.m 999 +(p8) fclass.nm p9,p0 = f8, 0xff + nop.i 999;; +} + +{.bbb + (p9) br.cond.spnt L(FREM_X_NAN_INF) + (p11) br.cond.spnt L(FREM_Y_NAN_INF_ZERO) + nop.b 0 +} {.mfi + nop.m 0 + // set D flag if a (f8) is denormal + fnma.s0 f6=f8,f1,f8 + nop.i 0;; +} + +L(remloop24): + { .mfi + nop.m 0 + // Step (2) + // q0 = a * y0 in f12 + (p6) fma.s1 f12=f13,f10,f0 + nop.i 0 +} { .mfi + nop.m 0 + // Step (3) + // e0 = 1 - b * y0 in f7 + (p6) fnma.s1 f7=f14,f10,f1 + nop.i 0;; +} {.mlx + nop.m 0 + // r2=1.25*2^{-24} + movl r2=0x33a00000;; +} + +{.mfi + nop.m 0 + // q1=q0*(1+e0) in f15 + fma.s1 f15=f12,f7,f12 + nop.i 0 +} +{ .mfi + nop.m 0 + // Step (4) + // e1 = e0 * e0 + E in f7 (E is the constant held in f32) + (p6) fma.s1 f7=f7,f7,f32 + nop.i 0;; +} + {.mii + (p7) getf.exp r29=f12 + (p7) mov r28=0xfffd + nop.i 0;; +} + + { .mfi + // f12=2^{23} + setf.s f12=r3 + // Step (5) + // q2 = q1 + e1 * q1 in f11 (fma.s: single-precision result) + (p6) fma.s.s1 f11=f7,f15,f15 + nop.i 0 +} { .mfi + nop.m 0 + // Step (6) + // q2 = q1 + e1 * q1 in f6 (same product without the .s completer) + (p6) fma.s1 f6=f7,f15,f15 + nop.i 0;; +} + + {.mmi + // f15=1.25*2^{-24} + setf.s f15=r2 + // q<1/4 ? (i.e. expon< -2) + (p7) cmp.gt p7,p0=r28,r29 + nop.i 0;; +} + +{.mfb + // r29= -32+bias + mov r29=0xffdf + // if |a/b|<1/4, set D flag before returning + (p7) fma.s0 f9=f9,f0,f8 + nop.b 0;; +} + {.mfb + nop.m 0 + // can be combined with bundle above if sign of 0 or + // FTZ enabled are not important + (p7) fmerge.s f8=f8,f9 + // return if 4*|a|<|b| (estimated quotient < 1/4) + (p7) br.ret.spnt b0;; +} + {.mfi + // f7=2^{-32} + setf.exp f7=r29 + // set f8 to current a value | sign + fmerge.s f8=f8,f13 + nop.i 0;; +} + {.mfi + getf.exp r28=f6 + // last step ? 
(q<2^{23}) + fcmp.lt.unc.s1 p0,p12=f6,f12 + nop.i 0;; +} + {.mfi + nop.m 0 + // r=a-b*q + fnma.s1 f6=f14,f11,f13 + nop.i 0 +} {.mfi + // r2=23+bias + mov r2=0xffff+23 + // q'=q-q*(1.25*2^{-24}) (q'=q-ulp) + fnma.s.s1 f15=f11,f15,f11 + nop.i 0;; +} + {.mmi + nop.m 0 + cmp.eq p11,p14=r2,r28 + nop.i 0;; +} + +.pred.rel "mutex",p11,p14 + {.mfi + nop.m 0 + // if exponent(q)=23 (r28==r2), then r=a-b*2^{23} + (p11) fnma.s1 f13=f12,f14,f13 + nop.i 0 +} +{.mfi + nop.m 0 + // r2=a-b*q' + (p14) fnma.s1 f13=f14,f15,f13 + nop.i 0;; +} + {.mfi + nop.m 0 + // r>0 iff q=RZ(a/b) and inexact + fcmp.gt.unc.s1 p8,p0=f6,f0 + nop.i 0 +} {.mfi + nop.m 0 + // r<0 iff q'=RZ(a/b) and inexact + (p14) fcmp.lt.unc.s1 p9,p10=f6,f0 + nop.i 0;; +} + +.pred.rel "mutex",p8,p9 + {.mfi + nop.m 0 + // (p8) Q=q+(last iteration ? sticky bits:0) + // i.e. Q=q+q*x (x=2^{-32} or 0) + (p8) fma.s1 f11=f11,f7,f11 + nop.i 0 +} {.mfi + nop.m 0 + // (p9) Q=q'+(last iteration ? sticky bits:0) + // i.e. Q=q'+q'*x (x=2^{-32} or 0) + (p9) fma.s1 f11=f15,f7,f15 + nop.i 0;; +} + + {.mfb + nop.m 0 + // (p9) set r=r2 (new a, if not last iteration) + // (p10) new a =r + (p10) mov f13=f6 + (p12) br.cond.sptk L(remloop24);; +} + +// last iteration + {.mfi + nop.m 0 + // set f9=|b|*sgn(a) + fmerge.s f9=f8,f9 + nop.i 0 +} + {.mfi + nop.m 0 + // round to integer + fcvt.fx.s1 f11=f11 + nop.i 0;; +} + {.mfi + nop.m 0 + // save sign of a + fmerge.s f7=f8,f8 + nop.i 0 +} {.mfi + nop.m 0 + // normalize + fcvt.xf f11=f11 + nop.i 0;; +} + {.mfi + nop.m 0 + // This can be removed if sign of 0 is not important + // get remainder using sf1 + fnma.s1 f12=f9,f11,f8 + nop.i 0 +} + {.mfi + nop.m 0 + // get remainder + fnma.s0 f8=f9,f11,f8 + nop.i 0;; +} + {.mfi + nop.m 0 + // f12=0? 
+ // This can be removed if sign of 0 is not important + fcmp.eq.unc.s1 p8,p0=f12,f0 + nop.i 0;; +} + {.mfb + nop.m 0 + // if f8=0, set sign correctly + // This can be removed if sign of 0 is not important + (p8) fmerge.s f8=f7,f8 + // return + br.ret.sptk b0;; +} + + + +L(FREM_X_NAN_INF): + +// Y zero ? +{.mfi + nop.m 0 + fma.s1 f10=f9,f1,f0 + nop.i 0;; +} +{.mfi + nop.m 0 + fcmp.eq.unc.s1 p11,p0=f10,f0 + nop.i 0;; +} +{.mib + nop.m 0 + nop.i 0 + // if Y zero + (p11) br.cond.spnt L(FREM_Y_ZERO);; +} + +// X infinity? Return QNAN indefinite +{ .mfi + nop.m 999 +(p0) fclass.m.unc p8,p0 = f8, 0x23 + nop.i 999 +} +// X infinity? Return QNAN indefinite +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11,p0 = f8, 0x23 + nop.i 999;; +} +// Y NaN ? +{.mfi + nop.m 999 +(p8) fclass.m.unc p0,p8=f9,0xc3 + nop.i 0;; +} +{.mfi + nop.m 999 + // also set Denormal flag if necessary +(p8) fnma.s0 f9=f9,f1,f9 + nop.i 0 +} +{ .mfi + nop.m 999 +(p8) frcpa.s0 f8,p7 = f8,f8 + nop.i 999 ;; +} + +{.mfi + nop.m 999 +(p11) mov f10=f8 + nop.i 0 +} +{ .mfi + nop.m 999 +(p8) fma f8=f8,f1,f0 + nop.i 0 ;; +} + +{ .mfb + nop.m 999 + frcpa.s0 f8,p7=f8,f9 + (p11) br.cond.spnt L(EXP_ERROR_RETURN);; +} +{ .mib + nop.m 0 + nop.i 0 + br.ret.spnt b0 ;; +} + + +L(FREM_Y_NAN_INF_ZERO): +// Y INF +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0x23 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p7) fma f8=f8,f1,f0 +(p7) br.ret.spnt b0 ;; +} + +// Y NAN? +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p10 = f9, 0xc3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fclass.nm p9,p0 = f9, 0xff + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p9) fma f8=f9,f1,f0 +(p9) br.ret.spnt b0 ;; +} + +L(FREM_Y_ZERO): +// Y zero? Must be zero at this point +// because it is the only choice left. +// Return QNAN indefinite + +// X NAN? 
+{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p10 = f8, 0xc3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fclass.nm p9,p10 = f8, 0xff + nop.i 999 ;; +} + +{.mfi + nop.m 999 + (p9) frcpa f11,p7=f8,f0 + nop.i 0;; +} +{ .mfi + nop.m 999 +(p10) frcpa f11,p7 = f0,f0 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s f10 = f8, f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma f8=f11,f1,f0 + nop.i 999;; +} + +L(EXP_ERROR_RETURN): + +{ .mib +(p0) mov GR_Parameter_TAG = 123 + nop.i 999 +(p0) br.sptk __libm_error_region;; +} + +.endp remainderl +ASM_SIZE_DIRECTIVE(remainderl) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__remainderl) +#endif + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new 64-byte stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack, advance pointer by 16 + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp // Recompute Parameter 3 address after the call +};; +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git 
a/sysdeps/ia64/fpu/e_scalb.S b/sysdeps/ia64/fpu/e_scalb.S new file mode 100644 index 0000000000..60be3b3ffc --- /dev/null +++ b/sysdeps/ia64/fpu/e_scalb.S @@ -0,0 +1,551 @@ +.file "scalb.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 1/26/01 Scalb completely reworked and now standalone version +// +// API +//============================================================== +// double = scalb (double x, double n) +// input floating point f8 and floating point f9 +// output floating point f8 +// +// Returns x* 2**n using an fma and detects overflow +// and underflow. 
+// +// + +#include "libm_support.h" + +FR_Floating_X = f8 +FR_Result = f8 +FR_Floating_N = f9 +FR_Result2 = f9 +FR_Norm_N = f10 +FR_Result3 = f11 +FR_Norm_X = f12 +FR_N_float_int = f13 +FR_Two_N = f14 +FR_Two_to_Big = f15 +FR_Big = f6 +FR_NBig = f7 + +GR_N_Biased = r15 +GR_Big = r16 +GR_NBig = r17 +GR_Scratch = r18 +GR_Scratch1 = r19 +GR_Bias = r20 +GR_N_as_int = r21 + +GR_SAVE_B0 = r32 +GR_SAVE_GP = r33 +GR_SAVE_PFS = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Tag = r38 + +.align 32 +.global scalb + +.section .text +.proc scalb +.align 32 + +scalb: +#ifdef _LIBC +.global __ieee754_scalb +.type __ieee754_scalb,@function +__ieee754_scalb: +#endif + +// +// Is x NAN, INF, ZERO, +-? +// +{ .mfi + alloc r32=ar.pfs,0,3,4,0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Scratch = 0x019C3F,r0 +} +// +// Is y a NAN, INF, ZERO, +-? +// +{ .mfi + nop.m 999 + fclass.m.unc p6,p0 = FR_Floating_N, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Scratch1 = 0x063BF,r0 +} +;; + +// +// Normalize N +// Normalize x +// +{ .mfi + nop.m 0 + fnorm.s1 FR_Norm_N = FR_Floating_N + nop.i 999 +} +{ .mfi + nop.m 999 + fnorm.s1 FR_Norm_X = FR_Floating_X + nop.i 999 +};; + +// +// Create 2**big (GR_Scratch = 0x19C3F = bias 0xFFFF + 40000) +// Create 2**-big (GR_Scratch1 = 0x063BF = bias 0xFFFF - 40000) +// Normalize x +// Branch on special values. 
+// +{ .mib + setf.exp FR_Big = GR_Scratch + nop.i 0 +(p6) br.cond.spnt L(SCALB_NAN_INF_ZERO) +} +{ .mib + setf.exp FR_NBig = GR_Scratch1 + nop.i 0 +(p7) br.cond.spnt L(SCALB_NAN_INF_ZERO) +};; + +// +// Convert N to a fp integer +// Create -35000 +// +{ .mfi + addl GR_Scratch = 1,r0 + fcvt.fx.trunc.s1 FR_N_float_int = FR_Norm_N + addl GR_NBig = -35000,r0 +} +;; + +// +// Put N in a GP register +// Convert N_float_int to floating point value +// Create 35000 +// Build the exponent Bias +// +{ .mii + getf.sig GR_N_as_int = FR_N_float_int + shl GR_Scratch = GR_Scratch,63 + addl GR_Big = 35000,r0 +} +{ .mfi + addl GR_Bias = 0x0FFFF,r0 + fcvt.xf FR_N_float_int = FR_N_float_int + nop.i 0 +};; + +// +// Catch those fp values that are beyond 2**64-1 (converted value equals GR_Scratch = 1<<63: p10) +// Is N > 35000 +// Is N < -35000 +// +{ .mfi + cmp.ne.unc p9,p10 = GR_N_as_int,GR_Scratch + nop.f 0 + nop.i 0 +} +{ .mmi + cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big + cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig + nop.i 0 +};; + +// +// Is N really an int, only for those non-int indefinites? +// Create exp bias. +// +{ .mfi + add GR_N_Biased = GR_Bias,GR_N_as_int +(p9) fcmp.neq.unc.s1 p7,p0 = FR_Norm_N, FR_N_float_int + nop.i 0 +};; + +// +// Branch and return if N is not an int. 
+// Main path, create 2**N (NOTE(review): the bundle below is tagged .mfi but lists only two instructions; confirm the assembler fills the F slot) +// +{ .mfi + setf.exp FR_Two_N = GR_N_Biased + nop.i 999 +} +{ .mfb + nop.m 0 +(p7) frcpa f8,p11 = f0,f0 +(p7) br.ret.spnt b0 +};; + +// +// Set denormal on denormal input x and denormal input N +// +{ .mfi + nop.m 999 +(p10)fcmp.ge.s1 p6,p8 = FR_Norm_N,f0 + nop.i 0 +};; +{ .mfi + nop.m 999 + fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fcmp.ge.s0 p12,p13 = FR_Floating_N,f0 + nop.i 0 +};; + +// +// Adjust 2**N if N was very small or very large (p6: use 2**big, p8: use 2**-big) +// + +{ .mfi + nop.m 0 +(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch = 0x00000000000303FF // loaded into FR_NBig below (presumably -2**1024; TODO confirm sign-bit encoding) +};; +{ .mfi + nop.m 0 +(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch1= 0x00000000000103FF // loaded into FR_Big below (bias 0xFFFF + 1024) +};; + +// Set up necessary status fields +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x41 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999 +};; + +// +// Do final operation (same product computed under status fields s0, s3 and s2) +// +{ .mfi + setf.exp FR_NBig = GR_Scratch + fma.d.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.d.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; +{ .mfi + setf.exp FR_Big = GR_Scratch1 + fma.d.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; + +// Check for overflow or underflow. +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflow) +// S3 user supplied status + FZ + TD (Underflow) +// +// +// Restore s3 +// Restore s2 +// +{ .mfi + nop.m 0 + fsetc.s3 0x7F,0x40 + nop.i 999 +} +{ .mfi + nop.m 0 + fsetc.s2 0x7F,0x40 + nop.i 999 +};; + +// +// Is the result zero? 
+// +{ .mfi + nop.m 999 + fclass.m.unc p6, p0 = FR_Result3, 0x007 + nop.i 999 +} +{ .mfi + addl GR_Tag = 53, r0 // error tag used on the overflow path (replaced by 54 for underflow) + fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big + nop.i 0 +};; + +// +// Detect masked underflow - Tiny + Inexact Only +// +{ .mfi + nop.m 999 +(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 + nop.i 999 +};; + +// +// Is result bigger than the allowed range? +// Branch out for underflow +// +{ .mfb +(p6) addl GR_Tag = 54, r0 +(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig +(p6) br.cond.spnt L(SCALB_UNDERFLOW) +};; + +// +// Branch out for overflow (p7: result >= FR_Big, p9: result <= FR_NBig) +// +{ .mbb + nop.m 0 +(p7) br.cond.spnt L(SCALB_OVERFLOW) +(p9) br.cond.spnt L(SCALB_OVERFLOW) +};; + +// +// Return from main path. +// +{ .mfb + nop.m 999 + nop.f 0 + br.ret.sptk b0;; +} + +L(SCALB_NAN_INF_ZERO): + +// +// Convert N to a fp integer +// +{ .mfi + addl GR_Scratch = 1,r0 + fcvt.fx.trunc.s1 FR_N_float_int = FR_Norm_N + nop.i 999 +} +{ .mfi + nop.m 0 + fclass.m.unc p6,p0 = FR_Floating_N, 0xc3 //@snan | @qnan + nop.i 0 +};; +{ .mfi + nop.m 0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xc3 //@snan | @qnan + shl GR_Scratch = GR_Scratch,63 +};; +{ .mfi + nop.m 0 + fclass.m.unc p8,p0 = FR_Floating_N, 0x21 // @pos | @inf + nop.i 0 +} + { .mfi + nop.m 0 + fclass.m.unc p9,p0 = FR_Floating_N, 0x22 // @neg | @inf + nop.i 0 +};; + +// +// Either X or N is a NaN, return result and possibly raise invalid. 
+// +{ .mfb + nop.m 0 +(p6) fma.d.s0 FR_Result = FR_Floating_N,FR_Floating_X,f0 +(p6) br.ret.spnt b0 +};; +{ .mfb + getf.sig GR_N_as_int = FR_N_float_int +(p7) fma.d.s0 FR_Result = FR_Floating_N,FR_Floating_X,f0 +(p7) br.ret.spnt b0 +};; + +// +// If N is +Inf, return x*N +// For N = -Inf, negate N to create +Inf +// +{ .mfb + nop.m 0 +(p8) fma.d.s0 FR_Result = FR_Floating_X, FR_Floating_N,f0 +(p8) br.ret.spnt b0 +} +{ .mfi + nop.m 0 +(p9) fnma.d.s0 FR_Floating_N = FR_Floating_N, f1, f0 + nop.i 0 +};; + +// +// If N==-Inf,return x/(-N) +// +{ .mfb + nop.m 0 +(p9) frcpa.s0 FR_Result,p6 = FR_Floating_X,FR_Floating_N +(p9) br.ret.spnt b0 +};; + +// +// Convert N_float_int to floating point value +// +{ .mfi + cmp.ne.unc p9,p0 = GR_N_as_int,GR_Scratch + fcvt.xf FR_N_float_int = FR_N_float_int + nop.i 0 +};; + +// +// Is N an integer? +// +{ .mfi + nop.m 0 +(p9) fcmp.neq.unc.s1 p7,p0 = FR_Norm_N, FR_N_float_int + nop.i 0 +};; + +// +// If N not an int, return NaN and raise invalid. +// +{ .mfb + nop.m 0 +(p7) frcpa.s0 FR_Result,p6 = f0,f0 +(p7) br.ret.spnt b0 +};; + +// +// Always return x in other path. +// +{ .mfb + nop.m 0 + fma.d.s0 FR_Result = FR_Floating_X,f1,f0 + br.ret.sptk b0 +};; + +.endp scalb +ASM_SIZE_DIRECTIVE(scalb) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__ieee754_scalb) +#endif +.proc __libm_error_region +__libm_error_region: + +L(SCALB_OVERFLOW): +L(SCALB_UNDERFLOW): + +// +// Get stack address of N +// +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +// +// Adjust sp (allocate a 64-byte frame) +// +{ .mfi +.fframe 64 + add sp=-64,sp + nop.f 0 + mov GR_SAVE_GP=gp +};; + +// +// Store N on stack in correct position +// Locate the address of x on stack +// +{ .mmi + stfd [GR_Parameter_Y] = FR_Norm_N,16 + add GR_Parameter_X = 16,sp +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +};; + +// +// Store x on the stack. +// Get address for result on stack. 
+// +.body +{ .mib + stfd [GR_Parameter_X] = FR_Norm_X + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = FR_Result + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# +};; + +// +// Get location of result on stack +// +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// +// Get the new result +// +{ .mmi + ldfd FR_Result = [GR_Parameter_RESULT] +.restore sp + add sp = 64,sp + mov b0 = GR_SAVE_B0 +};; + +// +// Restore gp, ar.pfs and return +// +{ .mib + mov gp = GR_SAVE_GP + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_scalbf.S b/sysdeps/ia64/fpu/e_scalbf.S new file mode 100644 index 0000000000..d4dfe5e1f4 --- /dev/null +++ b/sysdeps/ia64/fpu/e_scalbf.S @@ -0,0 +1,551 @@ +.file "scalbf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 1/26/01 Scalb completely reworked and now standalone version +// +// API +//============================================================== +// float = scalbf (float x, float n) +// input floating point f8 and floating point f9 +// output floating point f8 +// +// Returns x* 2**n using an fma and detects overflow +// and underflow. +// +// + +#include "libm_support.h" + +FR_Floating_X = f8 +FR_Result = f8 +FR_Floating_N = f9 +FR_Result2 = f9 +FR_Norm_N = f10 +FR_Result3 = f11 +FR_Norm_X = f12 +FR_N_float_int = f13 +FR_Two_N = f14 +FR_Two_to_Big = f15 +FR_Big = f6 +FR_NBig = f7 + +GR_N_Biased = r15 +GR_Big = r16 +GR_NBig = r17 +GR_Scratch = r18 +GR_Scratch1 = r19 +GR_Bias = r20 +GR_N_as_int = r21 + +GR_SAVE_B0 = r32 +GR_SAVE_GP = r33 +GR_SAVE_PFS = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Tag = r38 + +.align 32 +.global scalbf + +.section .text +.proc scalbf +.align 32 + +scalbf: +#ifdef _LIBC +.global __ieee754_scalbf +.type __ieee754_scalbf,@function +__ieee754_scalbf: +#endif + +// +// Is x NAN, INF, ZERO, +-? 
+// +{ .mfi + alloc r32=ar.pfs,0,3,4,0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Scratch = 0x019C3F,r0 +} +// +// Is y a NAN, INF, ZERO, +-? +// +{ .mfi + nop.m 999 + fclass.m.unc p6,p0 = FR_Floating_N, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Scratch1 = 0x063BF,r0 +} +;; + +// +// Convert N to a fp integer +// Normalize x +// +{ .mfi + nop.m 0 + fnorm.s1 FR_Norm_N = FR_Floating_N + nop.i 999 +} +{ .mfi + nop.m 999 + fnorm.s1 FR_Norm_X = FR_Floating_X + nop.i 999 +};; + +// +// Create 2*big +// Create 2**-big +// Normalize x +// Branch on special values. +// +{ .mib + setf.exp FR_Big = GR_Scratch + nop.i 0 +(p6) br.cond.spnt L(SCALBF_NAN_INF_ZERO) +} +{ .mib + setf.exp FR_NBig = GR_Scratch1 + nop.i 0 +(p7) br.cond.spnt L(SCALBF_NAN_INF_ZERO) +};; + +// +// Convert N to a fp integer +// Create -35000 +// +{ .mfi + addl GR_Scratch = 1,r0 + fcvt.fx.trunc.s1 FR_N_float_int = FR_Norm_N + addl GR_NBig = -35000,r0 +} +;; + +// +// Put N if a GP register +// Convert N_float_int to floating point value +// Create 35000 +// Build the exponent Bias +// +{ .mii + getf.sig GR_N_as_int = FR_N_float_int + shl GR_Scratch = GR_Scratch,63 + addl GR_Big = 35000,r0 +} +{ .mfi + addl GR_Bias = 0x0FFFF,r0 + fcvt.xf FR_N_float_int = FR_N_float_int + nop.i 0 +};; + +// +// Catch those fp values that are beyond 2**64-1 +// Is N > 35000 +// Is N < -35000 +// +{ .mfi + cmp.ne.unc p9,p10 = GR_N_as_int,GR_Scratch + nop.f 0 + nop.i 0 +} +{ .mmi + cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big + cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig + nop.i 0 +};; + +// +// Is N really an int, only for those non-int indefinites? +// Create exp bias. +// +{ .mfi + add GR_N_Biased = GR_Bias,GR_N_as_int +(p9) fcmp.neq.unc.s1 p7,p0 = FR_Norm_N, FR_N_float_int + nop.i 0 +};; + +// +// Branch and return if N is not an int. 
+// Main path, create 2**N +// +{ .mfi + setf.exp FR_Two_N = GR_N_Biased + nop.i 999 +} +{ .mfb + nop.m 0 +(p7) frcpa f8,p11 = f0,f0 +(p7) br.ret.spnt b0 +};; + +// +// Set denormal on denormal input x and denormal input N +// +{ .mfi + nop.m 999 +(p10)fcmp.ge.s1 p6,p8 = FR_Norm_N,f0 + nop.i 0 +};; +{ .mfi + nop.m 999 + fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fcmp.ge.s0 p12,p13 = FR_Floating_N,f0 + nop.i 0 +};; + +// +// Adjust 2**N if N was very small or very large +// + +{ .mfi + nop.m 0 +(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch = 0x000000000003007F +};; +{ .mfi + nop.m 0 +(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch1= 0x000000000001007F +};; + +// Set up necessary status fields +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x41 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999 +};; + +// +// Do final operation +// +{ .mfi + setf.exp FR_NBig = GR_Scratch + fma.s.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; +{ .mfi + setf.exp FR_Big = GR_Scratch1 + fma.s.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; + +// Check for overflow or underflow. +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflow) +// S3 user supplied status + FZ + TD (Underflow) +// +// +// Restore s3 +// Restore s2 +// +{ .mfi + nop.m 0 + fsetc.s3 0x7F,0x40 + nop.i 999 +} +{ .mfi + nop.m 0 + fsetc.s2 0x7F,0x40 + nop.i 999 +};; + +// +// Is the result zero? 
+// +{ .mfi + nop.m 999 + fclass.m.unc p6, p0 = FR_Result3, 0x007 + nop.i 999 +} +{ .mfi + addl GR_Tag = 55, r0 + fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big + nop.i 0 +};; + +// +// Detect masked underflow - Tiny + Inexact Only +// +{ .mfi + nop.m 999 +(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 + nop.i 999 +};; + +// +// Is result bigger the allowed range? +// Branch out for underflow +// +{ .mfb +(p6) addl GR_Tag = 56, r0 +(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig +(p6) br.cond.spnt L(SCALBF_UNDERFLOW) +};; + +// +// Branch out for overflow +// +{ .mbb + nop.m 0 +(p7) br.cond.spnt L(SCALBF_OVERFLOW) +(p9) br.cond.spnt L(SCALBF_OVERFLOW) +};; + +// +// Return from main path. +// +{ .mfb + nop.m 999 + nop.f 0 + br.ret.sptk b0;; +} + +L(SCALBF_NAN_INF_ZERO): + +// +// Convert N to a fp integer +// +{ .mfi + addl GR_Scratch = 1,r0 + fcvt.fx.trunc.s1 FR_N_float_int = FR_Norm_N + nop.i 999 +} +{ .mfi + nop.m 0 + fclass.m.unc p6,p0 = FR_Floating_N, 0xc3 //@snan | @qnan + nop.i 0 +};; +{ .mfi + nop.m 0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xc3 //@snan | @qnan + shl GR_Scratch = GR_Scratch,63 +};; +{ .mfi + nop.m 0 + fclass.m.unc p8,p0 = FR_Floating_N, 0x21 // @inf + nop.i 0 +} + { .mfi + nop.m 0 + fclass.m.unc p9,p0 = FR_Floating_N, 0x22 // @-inf + nop.i 0 +};; + +// +// Either X or N is a Nan, return result and possible raise invalid. 
+// +{ .mfb + nop.m 0 +(p6) fma.s.s0 FR_Result = FR_Floating_N,FR_Floating_X,f0 +(p6) br.ret.spnt b0 +};; +{ .mfb + getf.sig GR_N_as_int = FR_N_float_int +(p7) fma.s.s0 FR_Result = FR_Floating_N,FR_Floating_X,f0 +(p7) br.ret.spnt b0 +};; + +// +// If N + Inf do something special +// For N = -Inf, create Int +// +{ .mfb + nop.m 0 +(p8) fma.s.s0 FR_Result = FR_Floating_X, FR_Floating_N,f0 +(p8) br.ret.spnt b0 +} +{ .mfi + nop.m 0 +(p9) fnma.s.s0 FR_Floating_N = FR_Floating_N, f1, f0 + nop.i 0 +};; + +// +// If N==-Inf,return x/(-N) +// +{ .mfb + nop.m 0 +(p9) frcpa.s0 FR_Result,p6 = FR_Floating_X,FR_Floating_N +(p9) br.ret.spnt b0 +};; + +// +// Convert N_float_int to floating point value +// +{ .mfi + cmp.ne.unc p9,p0 = GR_N_as_int,GR_Scratch + fcvt.xf FR_N_float_int = FR_N_float_int + nop.i 0 +};; + +// +// Is N an integer. +// +{ .mfi + nop.m 0 +(p9) fcmp.neq.unc.s1 p7,p0 = FR_Norm_N, FR_N_float_int + nop.i 0 +};; + +// +// If N not an int, return NaN and raise invalid. +// +{ .mfb + nop.m 0 +(p7) frcpa.s0 FR_Result,p6 = f0,f0 +(p7) br.ret.spnt b0 +};; + +// +// Always return x in other path. +// +{ .mfb + nop.m 0 + fma.s.s0 FR_Result = FR_Floating_X,f1,f0 + br.ret.sptk b0 +};; + +.endp scalbf +ASM_SIZE_DIRECTIVE(scalbf) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__ieee754_scalbf) +#endif +.proc __libm_error_region +__libm_error_region: + +L(SCALBF_OVERFLOW): +L(SCALBF_UNDERFLOW): + +// +// Get stack address of N +// +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +// +// Adjust sp +// +{ .mfi +.fframe 64 + add sp=-64,sp + nop.f 0 + mov GR_SAVE_GP=gp +};; + +// +// Store N on stack in correct position +// Locate the address of x on stack +// +{ .mmi + stfs [GR_Parameter_Y] = FR_Norm_N,16 + add GR_Parameter_X = 16,sp +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +};; + +// +// Store x on the stack. +// Get address for result on stack. 
+// +.body +{ .mib + stfs [GR_Parameter_X] = FR_Norm_X + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 +} +{ .mib + stfs [GR_Parameter_Y] = FR_Result + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# +};; + +// +// Get location of result on stack +// +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// +// Get the new result +// +{ .mmi + ldfs FR_Result = [GR_Parameter_RESULT] +.restore sp + add sp = 64,sp + mov b0 = GR_SAVE_B0 +};; + +// +// Restore gp, ar.pfs and return +// +{ .mib + mov gp = GR_SAVE_GP + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_scalbl.S b/sysdeps/ia64/fpu/e_scalbl.S new file mode 100644 index 0000000000..dd493fec71 --- /dev/null +++ b/sysdeps/ia64/fpu/e_scalbl.S @@ -0,0 +1,551 @@ +.file "scalbl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 1/26/01 Scalb completely reworked and now standalone version +// +// API +//============================================================== +// double-extended = scalbl (double-extended x, double-extended n) +// input floating point f8 and floating point f9 +// output floating point f8 +// +// Returns x * 2**n using an fma and detects overflow +// and underflow. +// +// + +#include "libm_support.h" + +FR_Floating_X = f8 +FR_Result = f8 +FR_Floating_N = f9 +FR_Result2 = f9 +FR_Norm_N = f10 +FR_Result3 = f11 +FR_Norm_X = f12 +FR_N_float_int = f13 +FR_Two_N = f14 +FR_Two_to_Big = f15 +FR_Big = f6 +FR_NBig = f7 + +GR_N_Biased = r15 +GR_Big = r16 +GR_NBig = r17 +GR_Scratch = r18 +GR_Scratch1 = r19 +GR_Bias = r20 +GR_N_as_int = r21 + +GR_SAVE_B0 = r32 +GR_SAVE_GP = r33 +GR_SAVE_PFS = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Tag = r38 + +.align 32 +.global scalbl + +.section .text +.proc scalbl +.align 32 + +scalbl: +#ifdef _LIBC +.global __ieee754_scalbl +.type __ieee754_scalbl,@function +__ieee754_scalbl: +#endif + +// +// Is x NAN, INF, ZERO, +-? 
+// +{ .mfi + alloc r32=ar.pfs,0,3,4,0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Scratch = 0x019C3F,r0 +} +// +// Is N a NAN, INF, ZERO, +-? +// +{ .mfi + nop.m 999 + fclass.m.unc p6,p0 = FR_Floating_N, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Scratch1 = 0x063BF,r0 +} +;; + +// +// Normalize N +// Normalize x +// +{ .mfi + nop.m 0 + fnorm.s1 FR_Norm_N = FR_Floating_N + nop.i 999 +} +{ .mfi + nop.m 999 + fnorm.s1 FR_Norm_X = FR_Floating_X + nop.i 999 +};; + +// +// Create 2**big +// Create 2**-big +// Normalize x +// Branch on special values. +// +{ .mib + setf.exp FR_Big = GR_Scratch + nop.i 0 +(p6) br.cond.spnt L(SCALBL_NAN_INF_ZERO) +} +{ .mib + setf.exp FR_NBig = GR_Scratch1 + nop.i 0 +(p7) br.cond.spnt L(SCALBL_NAN_INF_ZERO) +};; + +// +// Convert N to a fp integer +// Create -35000 +// +{ .mfi + addl GR_Scratch = 1,r0 + fcvt.fx.trunc.s1 FR_N_float_int = FR_Norm_N + addl GR_NBig = -35000,r0 +} +;; + +// +// Put N in a GP register +// Convert N_float_int to floating point value +// Create 35000 +// Build the exponent Bias +// +{ .mii + getf.sig GR_N_as_int = FR_N_float_int + shl GR_Scratch = GR_Scratch,63 + addl GR_Big = 35000,r0 +} +{ .mfi + addl GR_Bias = 0x0FFFF,r0 + fcvt.xf FR_N_float_int = FR_N_float_int + nop.i 0 +};; + +// +// Catch those fp values that are beyond 2**64-1 +// Is N >= 35000 +// Is N <= -35000 +// +{ .mfi + cmp.ne.unc p9,p10 = GR_N_as_int,GR_Scratch + nop.f 0 + nop.i 0 +} +{ .mmi + cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big + cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig + nop.i 0 +};; + +// +// Is N really an int? Checked only when N_as_int is not 1<<63 (p9). +// Create the biased exponent for 2**N. +// +{ .mfi + add GR_N_Biased = GR_Bias,GR_N_as_int +(p9) fcmp.neq.unc.s1 p7,p0 = FR_Norm_N, FR_N_float_int + nop.i 0 +};; + +// +// Branch and return if N is not an int. 
+// Main path, create 2**N +// +{ .mfi + setf.exp FR_Two_N = GR_N_Biased + nop.i 999 +} +{ .mfb + nop.m 0 +(p7) frcpa f8,p11 = f0,f0 +(p7) br.ret.spnt b0 +};; + +// +// When N_as_int equals 1<<63 (p10), choose big/small from the sign of N; set denormal on denormal input x and denormal input N +// +{ .mfi + nop.m 999 +(p10)fcmp.ge.s1 p6,p8 = FR_Norm_N,f0 + nop.i 0 +};; +{ .mfi + nop.m 999 + fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fcmp.ge.s0 p12,p13 = FR_Floating_N,f0 + nop.i 0 +};; + +// +// Adjust 2**N if N was very small or very large +// + +{ .mfi + nop.m 0 +(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch = 0x0000000000033FFF +};; +{ .mfi + nop.m 0 +(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch1= 0x0000000000013FFF +};; + +// Set up necessary status fields +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x41 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999 +};; + +// +// Do final operation +// +{ .mfi + setf.exp FR_NBig = GR_Scratch + fma.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; +{ .mfi + setf.exp FR_Big = GR_Scratch1 + fma.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; + +// Check for overflow or underflow. +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflow) +// S3 user supplied status + FZ + TD (Underflow) +// +// +// Restore s3 +// Restore s2 +// +{ .mfi + nop.m 0 + fsetc.s3 0x7F,0x40 + nop.i 999 +} +{ .mfi + nop.m 0 + fsetc.s2 0x7F,0x40 + nop.i 999 +};; + +// +// Is the result zero? 
+// +{ .mfi + nop.m 999 + fclass.m.unc p6, p0 = FR_Result3, 0x007 + nop.i 999 +} +{ .mfi + addl GR_Tag = 51, r0 + fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big + nop.i 0 +};; + +// +// Detect masked underflow - Tiny + Inexact Only +// +{ .mfi + nop.m 999 +(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 + nop.i 999 +};; + +// +// Is result bigger than the allowed range? +// Branch out for underflow +// +{ .mfb +(p6) addl GR_Tag = 52, r0 +(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig +(p6) br.cond.spnt L(SCALBL_UNDERFLOW) +};; + +// +// Branch out for overflow +// +{ .mbb + nop.m 0 +(p7) br.cond.spnt L(SCALBL_OVERFLOW) +(p9) br.cond.spnt L(SCALBL_OVERFLOW) +};; + +// +// Return from main path. +// +{ .mfb + nop.m 999 + nop.f 0 + br.ret.sptk b0;; +} + +L(SCALBL_NAN_INF_ZERO): + +// +// Convert N to a fp integer +// +{ .mfi + addl GR_Scratch = 1,r0 + fcvt.fx.trunc.s1 FR_N_float_int = FR_Norm_N + nop.i 999 +} +{ .mfi + nop.m 0 + fclass.m.unc p6,p0 = FR_Floating_N, 0xc3 //@snan | @qnan + nop.i 0 +};; +{ .mfi + nop.m 0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xc3 //@snan | @qnan + shl GR_Scratch = GR_Scratch,63 +};; +{ .mfi + nop.m 0 + fclass.m.unc p8,p0 = FR_Floating_N, 0x21 // @+inf + nop.i 0 +} + { .mfi + nop.m 0 + fclass.m.unc p9,p0 = FR_Floating_N, 0x22 // @-inf + nop.i 0 +};; + +// +// Either X or N is a NaN, return result and possibly raise invalid. 
+// +{ .mfb + nop.m 0 +(p6) fma.s0 FR_Result = FR_Floating_N,FR_Floating_X,f0 +(p6) br.ret.spnt b0 +};; +{ .mfb + getf.sig GR_N_as_int = FR_N_float_int +(p7) fma.s0 FR_Result = FR_Floating_N,FR_Floating_X,f0 +(p7) br.ret.spnt b0 +};; + +// +// If N = +Inf, return x*N +// For N = -Inf, negate N to create +Inf +// +{ .mfb + nop.m 0 +(p8) fma.s0 FR_Result = FR_Floating_X, FR_Floating_N,f0 +(p8) br.ret.spnt b0 +} +{ .mfi + nop.m 0 +(p9) fnma.s0 FR_Floating_N = FR_Floating_N, f1, f0 + nop.i 0 +};; + +// +// If N==-Inf,return x/(-N) +// +{ .mfb + nop.m 0 +(p9) frcpa.s0 FR_Result,p6 = FR_Floating_X,FR_Floating_N +(p9) br.ret.spnt b0 +};; + +// +// Convert N_float_int to floating point value +// +{ .mfi + cmp.ne.unc p9,p0 = GR_N_as_int,GR_Scratch + fcvt.xf FR_N_float_int = FR_N_float_int + nop.i 0 +};; + +// +// Is N an integer? +// +{ .mfi + nop.m 0 +(p9) fcmp.neq.unc.s1 p7,p0 = FR_Norm_N, FR_N_float_int + nop.i 0 +};; + +// +// If N not an int, return NaN and raise invalid. +// +{ .mfb + nop.m 0 +(p7) frcpa.s0 FR_Result,p6 = f0,f0 +(p7) br.ret.spnt b0 +};; + +// +// Otherwise always return x. +// +{ .mfb + nop.m 0 + fma.s0 FR_Result = FR_Floating_X,f1,f0 + br.ret.sptk b0 +};; + +.endp scalbl +ASM_SIZE_DIRECTIVE(scalbl) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__ieee754_scalbl) +#endif +.proc __libm_error_region +__libm_error_region: + +L(SCALBL_OVERFLOW): +L(SCALBL_UNDERFLOW): + +// +// Get stack address of N +// +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +// +// Adjust sp +// +{ .mfi +.fframe 64 + add sp=-64,sp + nop.f 0 + mov GR_SAVE_GP=gp +};; + +// +// Store N on stack in correct position +// Locate the address of x on stack +// +{ .mmi + stfe [GR_Parameter_Y] = FR_Norm_N,16 + add GR_Parameter_X = 16,sp +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +};; + +// +// Store x on the stack. +// Get address for result on stack. 
+// +.body +{ .mib + stfe [GR_Parameter_X] = FR_Norm_X + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 +} +{ .mib + stfe [GR_Parameter_Y] = FR_Result + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# +};; + +// +// Get location of result on stack +// +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// +// Get the new result +// +{ .mmi + ldfe FR_Result = [GR_Parameter_RESULT] +.restore sp + add sp = 64,sp + mov b0 = GR_SAVE_B0 +};; + +// +// Restore gp, ar.pfs and return +// +{ .mib + mov gp = GR_SAVE_GP + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_sinh.S b/sysdeps/ia64/fpu/e_sinh.S new file mode 100644 index 0000000000..a478f4e0db --- /dev/null +++ b/sysdeps/ia64/fpu/e_sinh.S @@ -0,0 +1,1310 @@ +.file "sinh.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 10/12/00 Update to set denormal operand and underflow flags +// 1/22/01 Fixed to set inexact flag for small args. +// +// API +//============================================================== +// double = sinh(double) +// input floating point f8 +// output floating point f8 +// +// Registers used +//============================================================== +// general registers: +// r32 -> r47 +// predicate registers used: +// p6 p7 p8 p9 +// floating-point registers used: +// f9 -> f15; f32 -> f45; +// f8 has input, then output +// +// Overview of operation +//============================================================== +// There are four paths +// 1. |x| < 0.25 SINH_BY_POLY +// 2. |x| < 32 SINH_BY_TBL +// 3. |x| < 2^14 SINH_BY_EXP +// 4. |x| >= 2^14 SINH_HUGE +// +// For double extended we get infinity for x >= 400c b174 ddc0 31ae c0ea +// >= 1.0110001.... 
x 2^13 +// >= 11357.2166 +// +// But for double we get infinity for x >= 408633ce8fb9f87e +// >= 1.0110...x 2^9 +// >= +7.10476e+002 +// +// And for single we get infinity for x >= 42b3a496 +// >= 1.0110... x 2^6 +// >= 89.8215 +// +// SAFE: If there is danger of overflow set SAFE to 0 +// NOT implemented: if there is danger of underflow, set SAFE to 0 +// SAFE for all paths listed below +// +// 1. SINH_BY_POLY +// =============== +// If |x| is less than the tiny threshold, then clear SAFE +// For double, the tiny threshold is -1022 = -0x3fe => -3fe + ffff = fc01 +// register-biased, this is fc01 +// For single, the tiny threshold is -126 = -7e => -7e + ffff = ff81 +// If |x| < tiny threshold, set SAFE = 0 +// +// 2. SINH_BY_TBL +// ============= +// SAFE: SAFE is always 1 for TBL; +// +// 3. SINH_BY_EXP +// ============== +// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe +// r34 has N-1; 16382 is in register biased form, 0x13ffd +// There is danger of double overflow if N-1 > 0x3fe +// in register biased form, 0x103fd +// Analogously, there is danger of single overflow if N-1 > 0x7e +// in register biased form, 0x1007d +// SAFE: If there is danger of overflow set SAFE to 0 +// +// 4. 
SINH_HUGE +// ============ +// SAFE: SAFE is always 0 for HUGE + +#include "libm_support.h" + +// +// Assembly macros +//============================================================== +sinh_FR_X = f44 +sinh_FR_X2 = f9 +sinh_FR_X4 = f10 +sinh_FR_SGNX = f40 +sinh_FR_all_ones = f45 +sinh_FR_tmp = f42 + +sinh_FR_Inv_log2by64 = f9 +sinh_FR_log2by64_lo = f11 +sinh_FR_log2by64_hi = f10 + +sinh_FR_A1 = f9 +sinh_FR_A2 = f10 +sinh_FR_A3 = f11 + +sinh_FR_Rcub = f12 +sinh_FR_M_temp = f13 +sinh_FR_R_temp = f13 +sinh_FR_Rsq = f13 +sinh_FR_R = f14 + +sinh_FR_M = f38 + +sinh_FR_B1 = f15 +sinh_FR_B2 = f32 +sinh_FR_B3 = f33 + +sinh_FR_peven_temp1 = f34 +sinh_FR_peven_temp2 = f35 +sinh_FR_peven = f36 + +sinh_FR_podd_temp1 = f34 +sinh_FR_podd_temp2 = f35 +sinh_FR_podd = f37 + +sinh_FR_poly_podd_temp1 = f11 +sinh_FR_poly_podd_temp2 = f13 +sinh_FR_poly_peven_temp1 = f11 +sinh_FR_poly_peven_temp2 = f13 + +sinh_FR_J_temp = f9 +sinh_FR_J = f10 + +sinh_FR_Mmj = f39 + +sinh_FR_N_temp1 = f11 +sinh_FR_N_temp2 = f12 +sinh_FR_N = f13 + +sinh_FR_spos = f14 +sinh_FR_sneg = f15 + +sinh_FR_Tjhi = f32 +sinh_FR_Tjlo = f33 +sinh_FR_Tmjhi = f34 +sinh_FR_Tmjlo = f35 + +sinh_GR_mJ = r35 +sinh_GR_J = r36 + +sinh_AD_mJ = r38 +sinh_AD_J = r39 +sinh_GR_all_ones = r40 + +sinh_FR_S_hi = f9 +sinh_FR_S_hi_temp = f10 +sinh_FR_S_lo_temp1 = f11 +sinh_FR_S_lo_temp2 = f12 +sinh_FR_S_lo_temp3 = f13 + +sinh_FR_S_lo = f38 +sinh_FR_C_hi = f39 + +sinh_FR_C_hi_temp1 = f10 +sinh_FR_Y_hi = f11 +sinh_FR_Y_lo_temp = f12 +sinh_FR_Y_lo = f13 +sinh_FR_SINH = f9 + +sinh_FR_P1 = f14 +sinh_FR_P2 = f15 +sinh_FR_P3 = f32 +sinh_FR_P4 = f33 +sinh_FR_P5 = f34 +sinh_FR_P6 = f35 + +sinh_FR_TINY_THRESH = f9 + +sinh_FR_SINH_temp = f10 +sinh_FR_SCALE = f11 + +sinh_FR_signed_hi_lo = f10 + + +GR_SAVE_PFS = r41 +GR_SAVE_B0 = r42 +GR_SAVE_GP = r43 + +GR_Parameter_X = r44 +GR_Parameter_Y = r45 +GR_Parameter_RESULT = r46 + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif 
+ +.align 16 +double_sinh_arg_reduction: +ASM_TYPE_DIRECTIVE(double_sinh_arg_reduction,@object) + data8 0xB8AA3B295C17F0BC, 0x00004005 + data8 0xB17217F7D1000000, 0x00003FF8 + data8 0xCF79ABC9E3B39804, 0x00003FD0 +ASM_SIZE_DIRECTIVE(double_sinh_arg_reduction) + +double_sinh_p_table: +ASM_TYPE_DIRECTIVE(double_sinh_p_table,@object) + data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC + data8 0x8888888888888412, 0x00003FF8 + data8 0xD00D00D00D4D39F2, 0x00003FF2 + data8 0xB8EF1D28926D8891, 0x00003FEC + data8 0xD732377688025BE9, 0x00003FE5 + data8 0xB08AF9AE78C1239F, 0x00003FDE +ASM_SIZE_DIRECTIVE(double_sinh_p_table) + +double_sinh_ab_table: +ASM_TYPE_DIRECTIVE(double_sinh_ab_table,@object) + data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC + data8 0x88888888884ECDD5, 0x00003FF8 + data8 0xD00D0C6DCC26A86B, 0x00003FF2 + data8 0x8000000000000002, 0x00003FFE + data8 0xAAAAAAAAAA402C77, 0x00003FFA + data8 0xB60B6CC96BDB144D, 0x00003FF5 +ASM_SIZE_DIRECTIVE(double_sinh_ab_table) + +double_sinh_j_table: +ASM_TYPE_DIRECTIVE(double_sinh_j_table,@object) + data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 + data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 + data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 + data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 + data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 + data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 + data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 + data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 + data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 + data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 + data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 + data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 + data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 + data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 + data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000 + data8 
0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 + data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 + data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 + data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 + data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 + data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 + data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 + data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 + data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 + data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 + data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 + data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 + data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 + data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 + data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 + data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 + data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 + data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 + data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 + data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 + data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 + data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 + data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 + data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 + data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 + data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 + data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 + data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 + data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 + data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 + data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 + data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 
0x00000000 + data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 + data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 + data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 + data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 + data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 + data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 + data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 + data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 + data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 + data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 + data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 + data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 + data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 + data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 + data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 + data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 + data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 + data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 +ASM_SIZE_DIRECTIVE(double_sinh_j_table) + +.align 32 +.global sinh# + +.section .text +.proc sinh# +.align 32 + +sinh: +#ifdef _LIBC +.global __ieee754_sinh +.type __ieee754_sinh,@function +__ieee754_sinh: +#endif + +// X infinity or NAN? 
+// Take invalid fault if enabled + + +{ .mfi + alloc r32 = ar.pfs,0,12,4,0 +(p0) fclass.m.unc p6,p0 = f8, 0xe3 //@qnan | @snan | @inf + mov sinh_GR_all_ones = -1 +} +;; + + +{ .mfb + nop.m 999 +(p6) fma.d.s0 f8 = f8,f1,f8 +(p6) br.ret.spnt b0 ;; +} + +// Put 0.25 in f9; p6 true if x < 0.25 +// Make constant that will generate inexact when squared +{ .mlx + setf.sig sinh_FR_all_ones = sinh_GR_all_ones +(p0) movl r32 = 0x000000000000fffd ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 +(p0) fclass.m.unc p7,p0 = f8, 0x07 //@zero + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.s sinh_FR_X = f0,f8 +(p7) br.ret.spnt b0 ;; +} + +// Identify denormal operands. +{ .mfi + nop.m 999 + fclass.m.unc p10,p0 = f8, 0x09 // + denorm + nop.i 999 +};; +{ .mfi + nop.m 999 + fclass.m.unc p11,p0 = f8, 0x0a // - denorm + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.s sinh_FR_SGNX = f8,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.sptk L(SINH_BY_TBL) ;; +} + + +L(SINH_BY_POLY): + +// POLY cannot overflow so there is no need to call __libm_error_support +// Set tiny_SAFE (p7) to 1(0) if answer is not tiny +// Currently we do not use tiny_SAFE. So the setting of tiny_SAFE is +// commented out. +//(p0) movl r32 = 0x000000000000fc01 +//(p0) setf.exp f10 = r32 +//(p0) fcmp.lt.unc.s1 p6,p7 = f8,f10 +// Here is essentially the algorithm for SINH_BY_POLY. Care is take for the order +// of multiplication; and P_1 is not exactly 1/3!, P_2 is not exactly 1/5!, etc. +// Note that ax = |x| +// sinh(x) = sign * (series(e^x) - series(e^-x))/2 +// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! + ax^13/13!) +// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) ) +// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! 
+ ax^4*1/13!)) ) ) +// = sign * (ax + ax*p_odd + (ax*p_even)) +// = sign * (ax + Y_lo) +// sinh(x) = sign * (Y_hi + Y_lo) +// Get the values of P_x from the table +{ .mfb +(p0) addl r34 = @ltoff(double_sinh_p_table), gp +(p10) fma.d.s0 f8 = f8,f8,f8 +(p10) br.ret.spnt b0 +} +;; + +{ .mfb + ld8 r34 = [r34] +(p11) fnma.d.s0 f8 = f8,f8,f8 +(p11) br.ret.spnt b0 +} +;; + +// Calculate sinh_FR_X2 = ax*ax and sinh_FR_X4 = ax*ax*ax*ax +{ .mmf + nop.m 999 +(p0) ldfe sinh_FR_P1 = [r34],16 +(p0) fma.s1 sinh_FR_X2 = sinh_FR_X, sinh_FR_X, f0 ;; +} + +{ .mmi +(p0) ldfe sinh_FR_P2 = [r34],16 ;; +(p0) ldfe sinh_FR_P3 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe sinh_FR_P4 = [r34],16 ;; +(p0) ldfe sinh_FR_P5 = [r34],16 + nop.i 999 ;; +} + +{ .mfi +(p0) ldfe sinh_FR_P6 = [r34],16 +(p0) fma.s1 sinh_FR_X4 = sinh_FR_X2, sinh_FR_X2, f0 + nop.i 999 ;; +} + +// Calculate sinh_FR_podd = p_odd and sinh_FR_peven = p_even +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_podd_temp1 = sinh_FR_X4, sinh_FR_P5, sinh_FR_P3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_podd_temp2 = sinh_FR_X4, sinh_FR_poly_podd_temp1, sinh_FR_P1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_peven_temp1 = sinh_FR_X4, sinh_FR_P6, sinh_FR_P4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd = sinh_FR_X2, sinh_FR_poly_podd_temp2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_peven_temp2 = sinh_FR_X4, sinh_FR_poly_peven_temp1, sinh_FR_P2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven = sinh_FR_X4, sinh_FR_poly_peven_temp2, f0 + nop.i 999 ;; +} + +// Calculate sinh_FR_Y_lo = ax*p_odd + (ax*p_even) +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_X, sinh_FR_peven, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_X, sinh_FR_podd, sinh_FR_Y_lo_temp + nop.i 999 ;; +} + +// Calculate sinh_FR_SINH = Y_hi + Y_lo. 
Note that ax = Y_hi +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_SINH = sinh_FR_X, f1, sinh_FR_Y_lo + nop.i 999 ;; +} +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 +} + +// Calculate f8 = sign * (Y_hi + Y_lo) +// Go to return +{ .mfb + nop.m 999 +(p0) fma.d.s0 f8 = sinh_FR_SGNX,sinh_FR_SINH,f0 +(p0) br.ret.sptk b0 ;; +} + + +L(SINH_BY_TBL): + +// Now that we are at TBL; so far all we know is that |x| >= 0.25. +// The first two steps are the same for TBL and EXP, but if we are HUGE +// we want to leave now. +// Double-extended: +// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true) +// Double +// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) +// Single +// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000010009 ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p6,p7 = sinh_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(SINH_HUGE) ;; +} + +// r32 = 1 +// r34 = N-1 +// r35 = N +// r36 = j +// r37 = N+1 + +// TBL can never overflow +// sinh(x) = sinh(B+R) +// = sinh(B)cosh(R) + cosh(B)sinh(R) +// +// ax = |x| = M*log2/64 + R +// B = M*log2/64 +// M = 64*N + j +// We will calcualte M and get N as (M-j)/64 +// The division is a shift. +// exp(B) = exp(N*log2 + j*log2/64) +// = 2^N * 2^(j*log2/64) +// sinh(B) = 1/2(e^B -e^-B) +// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64)) +// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) +// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) +// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32 +// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit) +// R = ax - M*log2/64 +// R = ax - M*log2_by_64_hi - M*log2_by_64_lo +// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...) 
+// = 1 + p_odd + p_even +// where the p_even uses the A coefficients and the p_even uses the B coefficients +// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd +// cosh(R) = 1 + p_even +// sinh(B) = S_hi + S_lo +// cosh(B) = C_hi +// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R) +// ****************************************************** +// STEP 1 (TBL and EXP) +// ****************************************************** +// Get the following constants. +// f9 = Inv_log2by64 +// f10 = log2by64_hi +// f11 = log2by64_lo + +{ .mmi +(p0) adds r32 = 0x1,r0 +(p0) addl r34 = @ltoff(double_sinh_arg_reduction), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and +// put them in an exponent. +// sinh_FR_spos = 2^(N-1) and sinh_FR_sneg = 2^(-N-1) +// r39 = 0xffff + (N-1) = 0xffff +N -1 +// r40 = 0xffff - (N +1) = 0xffff -N -1 + +{ .mlx + nop.m 999 +(p0) movl r38 = 0x000000000000fffe ;; +} + +{ .mmi +(p0) ldfe sinh_FR_Inv_log2by64 = [r34],16 ;; +(p0) ldfe sinh_FR_log2by64_hi = [r34],16 + nop.i 999 ;; +} + +{ .mbb +(p0) ldfe sinh_FR_log2by64_lo = [r34],16 + nop.b 999 + nop.b 999 ;; +} + +// Get the A coefficients +// f9 = A_1 +// f10 = A_2 +// f11 = A_3 + +{ .mmi + nop.m 999 +(p0) addl r34 = @ltoff(double_sinh_ab_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// Calculate M and keep it as integer and floating point. 
+// f38 = M = round-to-integer(x*Inv_log2by64) +// sinh_FR_M = M = truncate(ax/(log2/64)) +// Put the significand of M in r35 +// and the floating point representation of M in sinh_FR_M + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_M = sinh_FR_X, sinh_FR_Inv_log2by64, f0 + nop.i 999 +} + +{ .mfi +(p0) ldfe sinh_FR_A1 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcvt.fx.s1 sinh_FR_M_temp = sinh_FR_M + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnorm.s1 sinh_FR_M = sinh_FR_M_temp + nop.i 999 ;; +} + +{ .mfi +(p0) getf.sig r35 = sinh_FR_M_temp + nop.f 999 + nop.i 999 ;; +} + +// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It +// has a range of -32 thru 31. +// r35 = M +// r36 = j + +{ .mii + nop.m 999 + nop.i 999 ;; +(p0) and r36 = 0x3f, r35 ;; +} + +// Calculate R +// f13 = f44 - f12*f10 = ax - M*log2by64_hi +// f14 = f13 - f8*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo + +{ .mfi + nop.m 999 +(p0) fnma.s1 sinh_FR_R_temp = sinh_FR_M, sinh_FR_log2by64_hi, sinh_FR_X + nop.i 999 +} + +{ .mfi +(p0) ldfe sinh_FR_A2 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 sinh_FR_R = sinh_FR_M, sinh_FR_log2by64_lo, sinh_FR_R_temp + nop.i 999 +} + +// Get the B coefficients +// f15 = B_1 +// f32 = B_2 +// f33 = B_3 + +{ .mmi +(p0) ldfe sinh_FR_A3 = [r34],16 ;; +(p0) ldfe sinh_FR_B1 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe sinh_FR_B2 = [r34],16 ;; +(p0) ldfe sinh_FR_B3 = [r34],16 + nop.i 999 ;; +} + +{ .mii + nop.m 999 +(p0) shl r34 = r36, 0x2 ;; +(p0) sxt1 r37 = r34 ;; +} + +// ****************************************************** +// STEP 2 (TBL and EXP) +// ****************************************************** +// Calculate Rsquared and Rcubed in preparation for p_even and p_odd +// f12 = R*R*R +// f13 = R*R +// f14 = R <== from above + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Rsq = sinh_FR_R, sinh_FR_R, f0 +(p0) shr r36 = r37, 0x2 ;; +} + +// r34 = M-j = r35 - r36 +// r35 = N = 
(M-j)/64 + +{ .mii +(p0) sub r34 = r35, r36 + nop.i 999 ;; +(p0) shr r35 = r34, 0x6 ;; +} + +{ .mii +(p0) sub r40 = r38, r35 +(p0) adds r37 = 0x1, r35 +(p0) add r39 = r38, r35 ;; +} + +// Get the address of the J table, add the offset, +// addresses are sinh_AD_mJ and sinh_AD_J, get the T value +// f32 = T(j)_hi +// f33 = T(j)_lo +// f34 = T(-j)_hi +// f35 = T(-j)_lo + +{ .mmi +(p0) sub r34 = r35, r32 +(p0) addl r37 = @ltoff(double_sinh_j_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r37 = [r37] + nop.m 999 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Rcub = sinh_FR_Rsq, sinh_FR_R, f0 + nop.i 999 +} + +// ****************************************************** +// STEP 3 Now decide if we need to branch to EXP +// ****************************************************** +// Put 32 in f9; p6 true if x < 32 +// Go to EXP if |x| >= 32 + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000010004 ;; +} + +// Calculate p_even +// f34 = B_2 + Rsq *B_3 +// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) +// f36 = p_even = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven_temp1 = sinh_FR_Rsq, sinh_FR_B3, sinh_FR_B2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven_temp2 = sinh_FR_Rsq, sinh_FR_peven_temp1, sinh_FR_B1 + nop.i 999 +} + +// Calculate p_odd +// f34 = A_2 + Rsq *A_3 +// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) +// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd_temp1 = sinh_FR_Rsq, sinh_FR_A3, sinh_FR_A2 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp sinh_FR_N_temp1 = r39 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven = sinh_FR_Rsq, sinh_FR_peven_temp2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd_temp2 = sinh_FR_Rsq, sinh_FR_podd_temp1, sinh_FR_A1 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd = 
sinh_FR_podd_temp2, sinh_FR_Rcub, sinh_FR_R + nop.i 999 +} + +// sinh_GR_mj contains the table offset for -j +// sinh_GR_j contains the table offset for +j +// p6 is true when j <= 0 + +{ .mlx +(p0) setf.exp sinh_FR_N_temp2 = r40 +(p0) movl r40 = 0x0000000000000020 ;; +} + +{ .mfi +(p0) sub sinh_GR_mJ = r40, r36 +(p0) fmerge.se sinh_FR_spos = sinh_FR_N_temp1, f1 +(p0) adds sinh_GR_J = 0x20, r36 ;; +} + +{ .mii + nop.m 999 +(p0) shl sinh_GR_mJ = sinh_GR_mJ, 5 ;; +(p0) add sinh_AD_mJ = r37, sinh_GR_mJ ;; +} + +{ .mmi + nop.m 999 +(p0) ldfe sinh_FR_Tmjhi = [sinh_AD_mJ],16 +(p0) shl sinh_GR_J = sinh_GR_J, 5 ;; +} + +{ .mfi +(p0) ldfs sinh_FR_Tmjlo = [sinh_AD_mJ],16 +(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 +(p0) add sinh_AD_J = r37, sinh_GR_J ;; +} + +{ .mmi +(p0) ldfe sinh_FR_Tjhi = [sinh_AD_J],16 ;; +(p0) ldfs sinh_FR_Tjlo = [sinh_AD_J],16 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.se sinh_FR_sneg = sinh_FR_N_temp2, f1 +(p7) br.cond.spnt L(SINH_BY_EXP) ;; +} + +{ .mfi + nop.m 999 + nop.f 999 + nop.i 999 ;; +} + +// ****************************************************** +// If NOT branch to EXP +// ****************************************************** +// Calculate S_hi and S_lo +// sinh_FR_S_hi_temp = sinh_FR_sneg * sinh_FR_Tmjhi +// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi_temp +// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - (sinh_FR_sneg * sinh_FR_Tmjlo) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_S_hi_temp = sinh_FR_sneg, sinh_FR_Tmjhi, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fms.s1 sinh_FR_S_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi_temp + nop.i 999 +} + +// Calculate C_hi +// sinh_FR_C_hi_temp1 = sinh_FR_sneg * sinh_FR_Tmjhi +// sinh_FR_C_hi = sinh_FR_spos * sinh_FR_Tjhi + sinh_FR_C_hi_temp1 + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_C_hi_temp1 = sinh_FR_sneg, sinh_FR_Tmjhi, f0 + nop.i 999 ;; +} + +// sinh_FR_S_lo_temp1 = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi +// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + 
(sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi) +// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_S_lo_temp1 ) + +{ .mfi + nop.m 999 +(p0) fms.s1 sinh_FR_S_lo_temp1 = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_C_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_C_hi_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 sinh_FR_S_lo_temp2 = sinh_FR_sneg, sinh_FR_Tmjhi, sinh_FR_S_lo_temp1 + nop.i 999 +} + +// sinh_FR_S_lo_temp1 = sinh_FR_sneg * sinh_FR_Tmjlo +// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo - sinh_FR_S_lo_temp1 +// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo -(sinh_FR_sneg * sinh_FR_Tmjlo) +// sinh_FR_S_lo = sinh_FR_S_lo_temp3 + sinh_FR_S_lo_temp2 + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_S_lo_temp1 = sinh_FR_sneg, sinh_FR_Tmjlo, f0 + nop.i 999 ;; +} + +/////////// BUG FIX fma to fms -TK +{ .mfi + nop.m 999 +(p0) fms.s1 sinh_FR_S_lo_temp3 = sinh_FR_spos, sinh_FR_Tjlo, sinh_FR_S_lo_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_S_lo = sinh_FR_S_lo_temp3, f1, sinh_FR_S_lo_temp2 + nop.i 999 ;; +} + +// Y_hi = S_hi +// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo) +// sinh_FR_Y_lo_temp = sinh_FR_S_hi * sinh_FR_peven + sinh_FR_S_lo +// sinh_FR_Y_lo = sinh_FR_C_hi * sinh_FR_podd + sinh_FR_Y_lo_temp + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_S_hi, sinh_FR_peven, sinh_FR_S_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_C_hi, sinh_FR_podd, sinh_FR_Y_lo_temp + nop.i 999 ;; +} + +// sinh_FR_SINH = Y_hi + Y_lo +// f8 = answer = sinh_FR_SGNX * sinh_FR_SINH + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_SINH = sinh_FR_S_hi, f1, sinh_FR_Y_lo + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.d.s0 f8 = sinh_FR_SGNX, sinh_FR_SINH,f0 +(p0) br.ret.sptk b0 ;; +} + + +L(SINH_BY_EXP): + +// When p7 is 
true, we know that an overflow is not going to happen +// When p7 is false, we must check for possible overflow +// p7 is the over_SAFE flag +// Y_hi = Tjhi +// Y_lo = Tjhi * (p_odd + p_even) +Tjlo +// Scale = sign * 2^(N-1) +// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_peven + sinh_FR_podd) +// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_Y_lo_temp ) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_peven, f1, sinh_FR_podd + nop.i 999 +} + +// Now we are in EXP. This is the only path where an overflow is possible +// but not for certain. So this is the only path where over_SAFE has any use. +// r34 still has N-1 +// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe +// There is a danger of double overflow if N-1 > 0x3fe = 1022 +{ .mlx + nop.m 999 +(p0) movl r32 = 0x00000000000003fe ;; +} + +{ .mfi +(p0) cmp.gt.unc p0,p7 = r34, r32 +(p0) fmerge.s sinh_FR_SCALE = sinh_FR_SGNX, sinh_FR_spos + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_Tjhi, sinh_FR_Y_lo_temp, sinh_FR_Tjlo + nop.i 999 ;; +} + +// f8 = answer = scale * (Y_hi + Y_lo) +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_SINH_temp = sinh_FR_Y_lo, f1, sinh_FR_Tjhi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.d.s0 f44 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 + nop.i 999 ;; +} + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p7) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 ;; +} + +// If over_SAFE is set, return +{ .mfb + nop.m 999 +(p7) fmerge.s f8 = f44,f44 +(p7) br.ret.sptk b0 ;; +} + +// Else see if we overflowed +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// If WRE is set then an overflow will not occur in EXP. +// The input value that would cause a register (WRE) value to overflow is about 2^15 +// and this input would go into the HUGE path. +// Answer with WRE is in f43. 
+ +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fma.d.s2 f43 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 + nop.i 999 ;; +} + +// 103FF => 103FF -FFFF = 400(true) +// 400 + 3FF = 7FF, which is 1 more than the exponent of the largest +// double (7FE). So 0 103FF 8000000000000000 is one ulp more than +// largest double in register bias +// Now set p8 if the answer with WRE is greater than or equal to this value +// Also set p9 if the answer with WRE is less than or equal to negative this value + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x000000000103FF ;; +} + +{ .mmf + nop.m 999 +(p0) setf.exp f41 = r32 +(p0) fsetc.s2 0x7F,0x40 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.ns f42 = f41, f41 + nop.i 999 ;; +} + +// The error tag for overflow is 127 +{ .mii + nop.m 999 + nop.i 999 ;; +(p8) mov r47 = 127 ;; +} + +{ .mfb + nop.m 999 +(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 +(p8) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +(p9) mov r47 = 127 +} + +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; +} + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.s f8 = f44,f44 +(p0) br.ret.sptk b0 ;; +} + +L(SINH_HUGE): + +// for SINH_HUGE, put 24000 in exponent; take sign from input; add 1 +// SAFE: SAFE is always 0 for HUGE + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000015dbf ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_signed_hi_lo = sinh_FR_SGNX, f9, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.d.s0 f44 = sinh_FR_signed_hi_lo, f9, f0 +(p0) mov r47 = 127 +} +.endp sinh +ASM_SIZE_DIRECTIVE(sinh) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__ieee754_sinh) +#endif + +// Stack operations when calling error support; SINH_HUGE above ends without a branch and falls through into this code. 
+// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 +// | | | | +// | <-GR_Y Y2->| Y2 ->| <- GR_Y | +// | | | | +// | | <- GR_X X1 ->| | +// | | | | +// sp-64 -> + sp -> + sp -> + + +// save ar.pfs save b0 restore gp +// save gp restore ar.pfs + +.proc __libm_error_region +__libm_error_region: +L(SINH_ERROR_SUPPORT): +.prologue + +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32) + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new 64-byte stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 (f0) on stack, then advance the pointer by 16 + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfd [GR_Parameter_X] = f8 // STORE Parameter 1 (f8) on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = f44 // STORE Parameter 3 (result held in f44) on stack + add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at the Parameter 2 slot + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp // Recompute the result slot address (sp+48) after the call +};; + +// (4) +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_sinhf.S b/sysdeps/ia64/fpu/e_sinhf.S new file mode 100644 index 0000000000..9b801d3720 --- /dev/null +++ b/sysdeps/ia64/fpu/e_sinhf.S @@ -0,0 +1,1311 @@ +.file "sinhf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. 
+// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 10/12/00 Update to set denormal operand and underflow flags +// 1/22/01 Fixed to set inexact flag for small args. 
+// +// API +//============================================================== +// float = sinhf(float) +// input floating point f8 +// output floating point f8 +// +// Registers used +//============================================================== +// general registers: +// r32 -> r47 +// predicate registers used: +// p6 p7 p8 p9 +// floating-point registers used: +// f9 -> f15; f32 -> f45; +// f8 has input, then output +// +// Overview of operation +//============================================================== +// There are four paths +// 1. |x| < 0.25 SINH_BY_POLY +// 2. |x| < 32 SINH_BY_TBL +// 3. |x| < 2^7 SINH_BY_EXP +// 4. |x| >= 2^7 SINH_HUGE +// +// For double extended we get infinity for x >= 400c b174 ddc0 31ae c0ea +// >= 1.0110001.... x 2^13 +// >= 11357.2166 +// +// But for double we get infinity for x >= 408633ce8fb9f87e +// >= 1.0110...x 2^9 +// >= +7.10476e+002 +// +// And for single we get infinity for x >= 42b3a496 +// >= 1.0110... x 2^6 +// >= 89.8215 +// +// SAFE: If there is danger of overflow set SAFE to 0 +// NOT implemented: if there is danger of underflow, set SAFE to 0 +// SAFE for all paths listed below +// +// 1. SINH_BY_POLY +// =============== +// If |x| is less than the tiny threshold, then clear SAFE +// For double, the tiny threshold is -1022 = -0x3fe => -3fe + ffff = fc01 +// register-biased, this is fc01 +// For single, the tiny threshold is -126 = -7e => -7e + ffff = ff81 +// If |x| < tiny threshold, set SAFE = 0 +// +// 2. SINH_BY_TBL +// ============= +// SAFE: SAFE is always 1 for TBL; +// +// 3. SINH_BY_EXP +// ============== +// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe +// r34 has N-1; 16382 is in register biased form, 0x13ffd +// There is danger of double overflow if N-1 > 0x3fe +// in register biased form, 0x103fd +// Analogously, there is danger of single overflow if N-1 > 0x7e +// in register biased form, 0x1007d +// SAFE: If there is danger of overflow set SAFE to 0 +// +// 4. 
SINH_HUGE +// ============ +// SAFE: SAFE is always 0 for HUGE +// + +#include "libm_support.h" + +// Assembly macros +//============================================================== +sinh_FR_X = f44 +sinh_FR_X2 = f9 +sinh_FR_X4 = f10 +sinh_FR_SGNX = f40 +sinh_FR_all_ones = f45 +sinh_FR_tmp = f42 + +sinh_FR_Inv_log2by64 = f9 +sinh_FR_log2by64_lo = f11 +sinh_FR_log2by64_hi = f10 + +sinh_FR_A1 = f9 +sinh_FR_A2 = f10 +sinh_FR_A3 = f11 + +sinh_FR_Rcub = f12 +sinh_FR_M_temp = f13 +sinh_FR_R_temp = f13 +sinh_FR_Rsq = f13 +sinh_FR_R = f14 + +sinh_FR_M = f38 + +sinh_FR_B1 = f15 +sinh_FR_B2 = f32 +sinh_FR_B3 = f33 + +sinh_FR_peven_temp1 = f34 +sinh_FR_peven_temp2 = f35 +sinh_FR_peven = f36 + +sinh_FR_podd_temp1 = f34 +sinh_FR_podd_temp2 = f35 +sinh_FR_podd = f37 + +sinh_FR_poly_podd_temp1 = f11 +sinh_FR_poly_podd_temp2 = f13 +sinh_FR_poly_peven_temp1 = f11 +sinh_FR_poly_peven_temp2 = f13 + +sinh_FR_J_temp = f9 +sinh_FR_J = f10 + +sinh_FR_Mmj = f39 + +sinh_FR_N_temp1 = f11 +sinh_FR_N_temp2 = f12 +sinh_FR_N = f13 + +sinh_FR_spos = f14 +sinh_FR_sneg = f15 + +sinh_FR_Tjhi = f32 +sinh_FR_Tjlo = f33 +sinh_FR_Tmjhi = f34 +sinh_FR_Tmjlo = f35 + +sinh_GR_mJ = r35 +sinh_GR_J = r36 + +sinh_AD_mJ = r38 +sinh_AD_J = r39 +sinh_GR_all_ones = r40 + +sinh_FR_S_hi = f9 +sinh_FR_S_hi_temp = f10 +sinh_FR_S_lo_temp1 = f11 +sinh_FR_S_lo_temp2 = f12 +sinh_FR_S_lo_temp3 = f13 + +sinh_FR_S_lo = f38 +sinh_FR_C_hi = f39 + +sinh_FR_C_hi_temp1 = f10 +sinh_FR_Y_hi = f11 +sinh_FR_Y_lo_temp = f12 +sinh_FR_Y_lo = f13 +sinh_FR_SINH = f9 + +sinh_FR_P1 = f14 +sinh_FR_P2 = f15 +sinh_FR_P3 = f32 +sinh_FR_P4 = f33 +sinh_FR_P5 = f34 +sinh_FR_P6 = f35 + +sinh_FR_TINY_THRESH = f9 + +sinh_FR_SINH_temp = f10 +sinh_FR_SCALE = f11 + +sinh_FR_signed_hi_lo = f10 + + +GR_SAVE_PFS = r41 +GR_SAVE_B0 = r42 +GR_SAVE_GP = r43 + +GR_Parameter_X = r44 +GR_Parameter_Y = r45 +GR_Parameter_RESULT = r46 + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif 
+ +.align 16 +double_sinh_arg_reduction: +ASM_TYPE_DIRECTIVE(double_sinh_arg_reduction,@object) + data8 0xB8AA3B295C17F0BC, 0x00004005 + data8 0xB17217F7D1000000, 0x00003FF8 + data8 0xCF79ABC9E3B39804, 0x00003FD0 +ASM_SIZE_DIRECTIVE(double_sinh_arg_reduction) + +double_sinh_p_table: +ASM_TYPE_DIRECTIVE(double_sinh_p_table,@object) + data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC + data8 0x8888888888888412, 0x00003FF8 + data8 0xD00D00D00D4D39F2, 0x00003FF2 + data8 0xB8EF1D28926D8891, 0x00003FEC + data8 0xD732377688025BE9, 0x00003FE5 + data8 0xB08AF9AE78C1239F, 0x00003FDE +ASM_SIZE_DIRECTIVE(double_sinh_p_table) + +double_sinh_ab_table: +ASM_TYPE_DIRECTIVE(double_sinh_ab_table,@object) + data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC + data8 0x88888888884ECDD5, 0x00003FF8 + data8 0xD00D0C6DCC26A86B, 0x00003FF2 + data8 0x8000000000000002, 0x00003FFE + data8 0xAAAAAAAAAA402C77, 0x00003FFA + data8 0xB60B6CC96BDB144D, 0x00003FF5 +ASM_SIZE_DIRECTIVE(double_sinh_ab_table) + +double_sinh_j_table: +ASM_TYPE_DIRECTIVE(double_sinh_j_table,@object) + data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 + data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 + data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 + data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 + data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 + data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 + data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 + data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 + data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 + data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 + data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 + data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 + data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 + data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 + data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000 + data8 
0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 + data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 + data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 + data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 + data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 + data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 + data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 + data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 + data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 + data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 + data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 + data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 + data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 + data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 + data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 + data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 + data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 + data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 + data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 + data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 + data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 + data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 + data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 + data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 + data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 + data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 + data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 + data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 + data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 + data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 + data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 + data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 
0x00000000 + data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 + data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 + data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 + data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 + data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 + data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 + data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 + data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 + data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 + data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 + data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 + data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 + data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 + data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 + data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 + data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 + data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 + data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 +ASM_SIZE_DIRECTIVE(double_sinh_j_table) + +.align 32 +.global sinhf# + +.section .text +.proc sinhf# +.align 32 + +sinhf: +#ifdef _LIBC +.global __ieee754_sinhf +.type __ieee754_sinhf,@function +__ieee754_sinhf: +#endif + +// X infinity or NAN? 
+// Take invalid fault if enabled + + +{ .mfi + alloc r32 = ar.pfs,0,12,4,0 +(p0) fclass.m.unc p6,p0 = f8, 0xe3 //@qnan | @snan | @inf + mov sinh_GR_all_ones = -1 +} +;; + + +{ .mfb + nop.m 999 +(p6) fma.s.s0 f8 = f8,f1,f8 +(p6) br.ret.spnt b0 ;; +} + +// Put 0.25 in f9; p6 true if x < 0.25 +// Make constant that will generate inexact when squared +{ .mlx + setf.sig sinh_FR_all_ones = sinh_GR_all_ones +(p0) movl r32 = 0x000000000000fffd ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 +(p0) fclass.m.unc p7,p0 = f8, 0x07 //@zero + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.s sinh_FR_X = f0,f8 +(p7) br.ret.spnt b0 ;; +} + +// Identify denormal operands. +{ .mfi + nop.m 999 + fclass.m.unc p10,p0 = f8, 0x09 // + denorm + nop.i 999 +};; +{ .mfi + nop.m 999 + fclass.m.unc p11,p0 = f8, 0x0a // - denorm + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.s sinh_FR_SGNX = f8,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.sptk L(SINH_BY_TBL) ;; +} + + +L(SINH_BY_POLY): + +// POLY cannot overflow so there is no need to call __libm_error_support +// Set tiny_SAFE (p7) to 1(0) if answer is not tiny +// Currently we do not use tiny_SAFE. So the setting of tiny_SAFE is +// commented out. +//(p0) movl r32 = 0x000000000000fc01 +//(p0) setf.exp f10 = r32 +//(p0) fcmp.lt.unc.s1 p6,p7 = f8,f10 +// Here is essentially the algorithm for SINH_BY_POLY. Care is take for the order +// of multiplication; and P_1 is not exactly 1/3!, P_2 is not exactly 1/5!, etc. +// Note that ax = |x| +// sinh(x) = sign * (series(e^x) - series(e^-x))/2 +// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! + ax^13/13!) +// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) ) +// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! 
+ ax^4*1/13!)) ) ) +// = sign * (ax + ax*p_odd + (ax*p_even)) +// = sign * (ax + Y_lo) +// sinh(x) = sign * (Y_hi + Y_lo) +// Get the values of P_x from the table +{ .mfb +(p0) addl r34 = @ltoff(double_sinh_p_table), gp +(p10) fma.s.s0 f8 = f8,f8,f8 +(p10) br.ret.spnt b0 +} +;; + +{ .mfb + ld8 r34 = [r34] +(p11) fnma.s.s0 f8 = f8,f8,f8 +(p11) br.ret.spnt b0 +} +;; + +// Calculate sinh_FR_X2 = ax*ax and sinh_FR_X4 = ax*ax*ax*ax +{ .mmf + nop.m 999 +(p0) ldfe sinh_FR_P1 = [r34],16 +(p0) fma.s1 sinh_FR_X2 = sinh_FR_X, sinh_FR_X, f0 ;; +} + +{ .mmi +(p0) ldfe sinh_FR_P2 = [r34],16 ;; +(p0) ldfe sinh_FR_P3 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe sinh_FR_P4 = [r34],16 ;; +(p0) ldfe sinh_FR_P5 = [r34],16 + nop.i 999 ;; +} + +{ .mfi +(p0) ldfe sinh_FR_P6 = [r34],16 +(p0) fma.s1 sinh_FR_X4 = sinh_FR_X2, sinh_FR_X2, f0 + nop.i 999 ;; +} + +// Calculate sinh_FR_podd = p_odd and sinh_FR_peven = p_even +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_podd_temp1 = sinh_FR_X4, sinh_FR_P5, sinh_FR_P3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_podd_temp2 = sinh_FR_X4, sinh_FR_poly_podd_temp1, sinh_FR_P1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_peven_temp1 = sinh_FR_X4, sinh_FR_P6, sinh_FR_P4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd = sinh_FR_X2, sinh_FR_poly_podd_temp2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_peven_temp2 = sinh_FR_X4, sinh_FR_poly_peven_temp1, sinh_FR_P2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven = sinh_FR_X4, sinh_FR_poly_peven_temp2, f0 + nop.i 999 ;; +} + +// Calculate sinh_FR_Y_lo = ax*p_odd + (ax*p_even) +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_X, sinh_FR_peven, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_X, sinh_FR_podd, sinh_FR_Y_lo_temp + nop.i 999 ;; +} + +// Calculate sinh_FR_SINH = Y_hi + Y_lo. 
Note that ax = Y_hi +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_SINH = sinh_FR_X, f1, sinh_FR_Y_lo + nop.i 999 ;; +} +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 +} + +// Calculate f8 = sign * (Y_hi + Y_lo) +// Go to return +{ .mfb + nop.m 999 +(p0) fma.s.s0 f8 = sinh_FR_SGNX,sinh_FR_SINH,f0 +(p0) br.ret.sptk b0 ;; +} + + +L(SINH_BY_TBL): + +// Now that we are at TBL; so far all we know is that |x| >= 0.25. +// The first two steps are the same for TBL and EXP, but if we are HUGE +// we want to leave now. +// Double-extended: +// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true) +// Double +// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) +// Single +// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000010006 ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p6,p7 = sinh_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(SINH_HUGE) ;; +} + +// r32 = 1 +// r34 = N-1 +// r35 = N +// r36 = j +// r37 = N+1 + +// TBL can never overflow +// sinh(x) = sinh(B+R) +// = sinh(B)cosh(R) + cosh(B)sinh(R) +// +// ax = |x| = M*log2/64 + R +// B = M*log2/64 +// M = 64*N + j +// We will calcualte M and get N as (M-j)/64 +// The division is a shift. +// exp(B) = exp(N*log2 + j*log2/64) +// = 2^N * 2^(j*log2/64) +// sinh(B) = 1/2(e^B -e^-B) +// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64)) +// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) +// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) +// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32 +// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit) +// R = ax - M*log2/64 +// R = ax - M*log2_by_64_hi - M*log2_by_64_lo +// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...) 
+// = 1 + p_odd + p_even +// where p_odd uses the A coefficients and p_even uses the B coefficients +// So sinh(R) = ((1 + p_odd + p_even) - (1 - p_odd + p_even))/2 = p_odd +// cosh(R) = 1 + p_even +// sinh(B) = S_hi + S_lo +// cosh(B) = C_hi +// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R) +// ****************************************************** +// STEP 1 (TBL and EXP) +// ****************************************************** +// Get the following constants. +// f9 = Inv_log2by64 +// f10 = log2by64_hi +// f11 = log2by64_lo + +{ .mmi +(p0) adds r32 = 0x1,r0 +(p0) addl r34 = @ltoff(double_sinh_arg_reduction), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and +// put them in an exponent. +// sinh_FR_spos = 2^(N-1) and sinh_FR_sneg = 2^(-N-1) +// r39 = 0xffff + (N-1) = 0xffff +N -1 +// r40 = 0xffff - (N +1) = 0xffff -N -1 + +{ .mlx + nop.m 999 +(p0) movl r38 = 0x000000000000fffe ;; +} + +{ .mmi +(p0) ldfe sinh_FR_Inv_log2by64 = [r34],16 ;; +(p0) ldfe sinh_FR_log2by64_hi = [r34],16 + nop.i 999 ;; +} + +{ .mbb +(p0) ldfe sinh_FR_log2by64_lo = [r34],16 + nop.b 999 + nop.b 999 ;; +} + +// Get the A coefficients +// f9 = A_1 +// f10 = A_2 +// f11 = A_3 + +{ .mmi + nop.m 999 +(p0) addl r34 = @ltoff(double_sinh_ab_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// Calculate M and keep it as integer and floating point. 
+// f38 = M = round-to-integer(x*Inv_log2by64) +// sinh_FR_M = M = truncate(ax/(log2/64)) +// Put the significand of M in r35 +// and the floating point representation of M in sinh_FR_M + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_M = sinh_FR_X, sinh_FR_Inv_log2by64, f0 + nop.i 999 +} + +{ .mfi +(p0) ldfe sinh_FR_A1 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcvt.fx.s1 sinh_FR_M_temp = sinh_FR_M + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnorm.s1 sinh_FR_M = sinh_FR_M_temp + nop.i 999 ;; +} + +{ .mfi +(p0) getf.sig r35 = sinh_FR_M_temp + nop.f 999 + nop.i 999 ;; +} + +// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It +// has a range of -32 thru 31. +// r35 = M +// r36 = j + +{ .mii + nop.m 999 + nop.i 999 ;; +(p0) and r36 = 0x3f, r35 ;; +} + +// Calculate R +// f13 = f44 - f12*f10 = ax - M*log2by64_hi +// f14 = f13 - f8*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo + +{ .mfi + nop.m 999 +(p0) fnma.s1 sinh_FR_R_temp = sinh_FR_M, sinh_FR_log2by64_hi, sinh_FR_X + nop.i 999 +} + +{ .mfi +(p0) ldfe sinh_FR_A2 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 sinh_FR_R = sinh_FR_M, sinh_FR_log2by64_lo, sinh_FR_R_temp + nop.i 999 +} + +// Get the B coefficients +// f15 = B_1 +// f32 = B_2 +// f33 = B_3 + +{ .mmi +(p0) ldfe sinh_FR_A3 = [r34],16 ;; +(p0) ldfe sinh_FR_B1 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe sinh_FR_B2 = [r34],16 ;; +(p0) ldfe sinh_FR_B3 = [r34],16 + nop.i 999 ;; +} + +{ .mii + nop.m 999 +(p0) shl r34 = r36, 0x2 ;; +(p0) sxt1 r37 = r34 ;; +} + +// ****************************************************** +// STEP 2 (TBL and EXP) +// ****************************************************** +// Calculate Rsquared and Rcubed in preparation for p_even and p_odd +// f12 = R*R*R +// f13 = R*R +// f14 = R <== from above + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Rsq = sinh_FR_R, sinh_FR_R, f0 +(p0) shr r36 = r37, 0x2 ;; +} + +// r34 = M-j = r35 - r36 +// r35 = N = 
(M-j)/64 + +{ .mii +(p0) sub r34 = r35, r36 + nop.i 999 ;; +(p0) shr r35 = r34, 0x6 ;; +} + +{ .mii +(p0) sub r40 = r38, r35 +(p0) adds r37 = 0x1, r35 +(p0) add r39 = r38, r35 ;; +} + +// Get the address of the J table, add the offset, +// addresses are sinh_AD_mJ and sinh_AD_J, get the T value +// f32 = T(j)_hi +// f33 = T(j)_lo +// f34 = T(-j)_hi +// f35 = T(-j)_lo + +{ .mmi +(p0) sub r34 = r35, r32 +(p0) addl r37 = @ltoff(double_sinh_j_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r37 = [r37] + nop.m 999 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Rcub = sinh_FR_Rsq, sinh_FR_R, f0 + nop.i 999 +} + +// ****************************************************** +// STEP 3 Now decide if we need to branch to EXP +// ****************************************************** +// Put 32 in f9; p6 true if x < 32 +// Go to EXP if |x| >= 32 + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000010004 ;; +} + +// Calculate p_even +// f34 = B_2 + Rsq *B_3 +// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) +// f36 = p_even = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven_temp1 = sinh_FR_Rsq, sinh_FR_B3, sinh_FR_B2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven_temp2 = sinh_FR_Rsq, sinh_FR_peven_temp1, sinh_FR_B1 + nop.i 999 +} + +// Calculate p_odd +// f34 = A_2 + Rsq *A_3 +// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) +// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd_temp1 = sinh_FR_Rsq, sinh_FR_A3, sinh_FR_A2 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp sinh_FR_N_temp1 = r39 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven = sinh_FR_Rsq, sinh_FR_peven_temp2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd_temp2 = sinh_FR_Rsq, sinh_FR_podd_temp1, sinh_FR_A1 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd = 
sinh_FR_podd_temp2, sinh_FR_Rcub, sinh_FR_R + nop.i 999 +} + +// sinh_GR_mj contains the table offset for -j +// sinh_GR_j contains the table offset for +j +// p6 is true when j <= 0 + +{ .mlx +(p0) setf.exp sinh_FR_N_temp2 = r40 +(p0) movl r40 = 0x0000000000000020 ;; +} + +{ .mfi +(p0) sub sinh_GR_mJ = r40, r36 +(p0) fmerge.se sinh_FR_spos = sinh_FR_N_temp1, f1 +(p0) adds sinh_GR_J = 0x20, r36 ;; +} + +{ .mii + nop.m 999 +(p0) shl sinh_GR_mJ = sinh_GR_mJ, 5 ;; +(p0) add sinh_AD_mJ = r37, sinh_GR_mJ ;; +} + +{ .mmi + nop.m 999 +(p0) ldfe sinh_FR_Tmjhi = [sinh_AD_mJ],16 +(p0) shl sinh_GR_J = sinh_GR_J, 5 ;; +} + +{ .mfi +(p0) ldfs sinh_FR_Tmjlo = [sinh_AD_mJ],16 +(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 +(p0) add sinh_AD_J = r37, sinh_GR_J ;; +} + +{ .mmi +(p0) ldfe sinh_FR_Tjhi = [sinh_AD_J],16 ;; +(p0) ldfs sinh_FR_Tjlo = [sinh_AD_J],16 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.se sinh_FR_sneg = sinh_FR_N_temp2, f1 +(p7) br.cond.spnt L(SINH_BY_EXP) ;; +} + +{ .mfi + nop.m 999 + nop.f 999 + nop.i 999 ;; +} + +// ****************************************************** +// If NOT branch to EXP +// ****************************************************** +// Calculate S_hi and S_lo +// sinh_FR_S_hi_temp = sinh_FR_sneg * sinh_FR_Tmjhi +// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi_temp +// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - (sinh_FR_sneg * sinh_FR_Tmjlo) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_S_hi_temp = sinh_FR_sneg, sinh_FR_Tmjhi, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fms.s1 sinh_FR_S_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi_temp + nop.i 999 +} + +// Calculate C_hi +// sinh_FR_C_hi_temp1 = sinh_FR_sneg * sinh_FR_Tmjhi +// sinh_FR_C_hi = sinh_FR_spos * sinh_FR_Tjhi + sinh_FR_C_hi_temp1 + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_C_hi_temp1 = sinh_FR_sneg, sinh_FR_Tmjhi, f0 + nop.i 999 ;; +} + +// sinh_FR_S_lo_temp1 = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi +// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + 
(sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi) +// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_S_lo_temp1 ) + +{ .mfi + nop.m 999 +(p0) fms.s1 sinh_FR_S_lo_temp1 = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_C_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_C_hi_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 sinh_FR_S_lo_temp2 = sinh_FR_sneg, sinh_FR_Tmjhi, sinh_FR_S_lo_temp1 + nop.i 999 +} + +// sinh_FR_S_lo_temp1 = sinh_FR_sneg * sinh_FR_Tmjlo +// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo - sinh_FR_S_lo_temp1 +// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo -(sinh_FR_sneg * sinh_FR_Tmjlo) +// sinh_FR_S_lo = sinh_FR_S_lo_temp3 + sinh_FR_S_lo_temp2 + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_S_lo_temp1 = sinh_FR_sneg, sinh_FR_Tmjlo, f0 + nop.i 999 ;; +} + +/////////// BUG FIX fma to fms -TK +{ .mfi + nop.m 999 +(p0) fms.s1 sinh_FR_S_lo_temp3 = sinh_FR_spos, sinh_FR_Tjlo, sinh_FR_S_lo_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_S_lo = sinh_FR_S_lo_temp3, f1, sinh_FR_S_lo_temp2 + nop.i 999 ;; +} + +// Y_hi = S_hi +// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo) +// sinh_FR_Y_lo_temp = sinh_FR_S_hi * sinh_FR_peven + sinh_FR_S_lo +// sinh_FR_Y_lo = sinh_FR_C_hi * sinh_FR_podd + sinh_FR_Y_lo_temp + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_S_hi, sinh_FR_peven, sinh_FR_S_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_C_hi, sinh_FR_podd, sinh_FR_Y_lo_temp + nop.i 999 ;; +} + +// sinh_FR_SINH = Y_hi + Y_lo +// f8 = answer = sinh_FR_SGNX * sinh_FR_SINH + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_SINH = sinh_FR_S_hi, f1, sinh_FR_Y_lo + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.s.s0 f8 = sinh_FR_SGNX, sinh_FR_SINH,f0 +(p0) br.ret.sptk b0 ;; +} + + +L(SINH_BY_EXP): + +// When p7 is 
true, we know that an overflow is not going to happen +// When p7 is false, we must check for possible overflow +// p7 is the over_SAFE flag +// Y_hi = Tjhi +// Y_lo = Tjhi * (p_odd + p_even) +Tjlo +// Scale = sign * 2^(N-1) +// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_peven + sinh_FR_podd) +// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_Y_lo_temp ) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_peven, f1, sinh_FR_podd + nop.i 999 +} + +// Now we are in EXP. This is the only path where an overflow is possible +// but not for certain. So this is the only path where over_SAFE has any use. +// r34 still has N-1 +// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe +// There is a danger of double overflow if N-1 > 0x3fe = 1022 +// There is a danger of single overflow if N-1 > 0x7e = 126 +{ .mlx + nop.m 999 +(p0) movl r32 = 0x000000000000007e ;; +} + +{ .mfi +(p0) cmp.gt.unc p0,p7 = r34, r32 +(p0) fmerge.s sinh_FR_SCALE = sinh_FR_SGNX, sinh_FR_spos + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_Tjhi, sinh_FR_Y_lo_temp, sinh_FR_Tjlo + nop.i 999 ;; +} + +// f8 = answer = scale * (Y_hi + Y_lo) +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_SINH_temp = sinh_FR_Y_lo, f1, sinh_FR_Tjhi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s.s0 f44 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 + nop.i 999 ;; +} + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p7) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 ;; +} + +// If over_SAFE is set, return +{ .mfb + nop.m 999 +(p7) fmerge.s f8 = f44,f44 +(p7) br.ret.sptk b0 ;; +} + +// Else see if we overflowed +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// If WRE is set then an overflow will not occur in EXP. +// The input value that would cause a register (WRE) value to overflow is about 2^15 +// and this input would go into the HUGE path. +// Answer with WRE is in f43. 
+ +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fma.s.s2 f43 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 + nop.i 999 ;; +} + +// 1007F => 1007F -FFFF = 80(true) +// 80 + 7F = FF, which is 1 more than the exponent of the largest +// single (FE). So 0 1007F 8000000000000000 is one ulp more than +// largest single in register bias +// Now set p8 if the answer with WRE is greater than or equal this value +// Also set p9 if the answer with WRE is less than or equal to negative this value + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000001007F ;; +} + +{ .mmf + nop.m 999 +(p0) setf.exp f41 = r32 +(p0) fsetc.s2 0x7F,0x40 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.ns f42 = f41, f41 + nop.i 999 ;; +} + +// The error tag for overflow is 128 +{ .mii + nop.m 999 + nop.i 999 ;; +(p8) mov r47 = 128 ;; +} + +{ .mfb + nop.m 999 +(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 +(p8) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +(p9) mov r47 = 128 +} + +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; +} + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.s f8 = f44,f44 +(p0) br.ret.sptk b0 ;; +} + +L(SINH_HUGE): + +// for SINH_HUGE, put 24000 in exponent; take sign from input; add 1 +// SAFE: SAFE is always 0 for HUGE + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000015dbf ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_signed_hi_lo = sinh_FR_SGNX, f9, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s.s0 f44 = sinh_FR_signed_hi_lo, f9, f0 +(p0) mov r47 = 128 +} +.endp sinhf +ASM_SIZE_DIRECTIVE(sinhf) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__ieee754_sinhf) +#endif + +// Stack operations when calling error support. 
+// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 +// | | | | +// | <-GR_Y Y2->| Y2 ->| <- GR_Y | +// | | | | +// | | <- GR_X X1 ->| | +// | | | | +// sp-64 -> + sp -> + sp -> + + +// save ar.pfs save b0 restore gp +// save gp restore ar.pfs + +// __libm_error_region / SINH_ERROR_SUPPORT: overflow reporting path for sinhf. +// Entered by branch from SINH_BY_EXP when p8 or p9 is set, and by +// fall-through from SINH_HUGE; every entry first sets r47 = 128, the +// overflow error tag. f8 is stored as parameter 1, f0 (zero) as +// parameter 2 and the tentative result f44 as parameter 3. After +// __libm_error_support returns, the value found in the result slot is +// loaded back into f8 and returned to the caller of sinhf. +.proc __libm_error_region +__libm_error_region: +L(SINH_ERROR_SUPPORT): +.prologue + +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfs [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfs [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +// Recompute the result address after the call: sp+48 is the slot that +// f44 was stored to before the call. NOTE(review): presumably needed because +// GR_Parameter_RESULT does not survive the call -- confirm. 
+{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_sinhl.S b/sysdeps/ia64/fpu/e_sinhl.S new file mode 100644 index 0000000000..b697c48601 --- /dev/null +++ b/sysdeps/ia64/fpu/e_sinhl.S @@ -0,0 +1,1311 @@ +.file "sinhl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. 
+// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 10/12/00 Update to set denormal operand and underflow flags +// 1/22/01 Fixed to set inexact flag for small args. Fixed incorrect +// call to __libm_error_support for 710.476 < x < 11357.2166. 
+// +// API +//============================================================== +// long double = sinhl(long double) +// input floating point f8 +// output floating point f8 +// +// Registers used +//============================================================== +// general registers: +// r32 -> r47 +// predicate registers used: +// p6 p7 p8 p9 +// floating-point registers used: +// f9 -> f15; f32 -> f45; +// f8 has input, then output +// +// Overview of operation +//============================================================== +// There are four paths +// 1. |x| < 0.25 SINH_BY_POLY +// 2. |x| < 32 SINH_BY_TBL +// 3. |x| < 2^14 SINH_BY_EXP +// 4. |x| >= 2^14 SINH_HUGE +// +// For double extended we get infinity for x >= 400c b174 ddc0 31ae c0ea +// >= 1.0110001.... x 2^13 +// >= 11357.2166 +// +// But for double we get infinity for x >= 408633ce8fb9f87e +// >= 1.0110...x 2^9 +// >= +7.10476e+002 +// +// And for single we get infinity for x >= 42b3a496 +// >= 1.0110... x 2^6 +// >= 89.8215 +// +// SAFE: If there is danger of overflow set SAFE to 0 +// NOT implemented: if there is danger of underflow, set SAFE to 0 +// SAFE for all paths listed below +// +// 1. SINH_BY_POLY +// =============== +// If |x| is less than the tiny threshold, then clear SAFE +// For double, the tiny threshold is -1022 = -0x3fe => -3fe + ffff = fc01 +// register-biased, this is fc01 +// For single, the tiny threshold is -126 = -7e => -7e + ffff = ff81 +// If |x| < tiny threshold, set SAFE = 0 +// +// 2. SINH_BY_TBL +// ============= +// SAFE: SAFE is always 1 for TBL; +// +// 3. 
SINH_BY_EXP +// ============== +// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe +// r34 has N-1; 16382 is in register biased form, 0x13ffd +// There is danger of double overflow if N-1 > 0x3fe +// in register biased form, 0x103fd +// Analogously, there is danger of single overflow if N-1 > 0x7e +// in register biased form, 0x1007d +// SAFE: If there is danger of overflow set SAFE to 0 +// +// 4. SINH_HUGE +// ============ +// SAFE: SAFE is always 0 for HUGE +// + +#include "libm_support.h" + +// Assembly macros +//============================================================== +// Symbolic names for the floating-point (sinh_FR_*), general (sinh_GR_*) +// and address (sinh_AD_*) registers used below. Several names are aliases +// for the same physical register (for example f9 is sinh_FR_X2, +// sinh_FR_Inv_log2by64, sinh_FR_A1, sinh_FR_J_temp, sinh_FR_S_hi and +// sinh_FR_SINH), so a value held under one name is overwritten when +// another alias of that register is written. 
+sinh_FR_X = f44 +sinh_FR_X2 = f9 +sinh_FR_X4 = f10 +sinh_FR_SGNX = f40 +sinh_FR_all_ones = f45 +sinh_FR_tmp = f42 + +sinh_FR_Inv_log2by64 = f9 +sinh_FR_log2by64_lo = f11 +sinh_FR_log2by64_hi = f10 + +sinh_FR_A1 = f9 +sinh_FR_A2 = f10 +sinh_FR_A3 = f11 + +sinh_FR_Rcub = f12 +sinh_FR_M_temp = f13 +sinh_FR_R_temp = f13 +sinh_FR_Rsq = f13 +sinh_FR_R = f14 + +sinh_FR_M = f38 + +sinh_FR_B1 = f15 +sinh_FR_B2 = f32 +sinh_FR_B3 = f33 + +sinh_FR_peven_temp1 = f34 +sinh_FR_peven_temp2 = f35 +sinh_FR_peven = f36 + +sinh_FR_podd_temp1 = f34 +sinh_FR_podd_temp2 = f35 +sinh_FR_podd = f37 + +sinh_FR_poly_podd_temp1 = f11 +sinh_FR_poly_podd_temp2 = f13 +sinh_FR_poly_peven_temp1 = f11 +sinh_FR_poly_peven_temp2 = f13 + +sinh_FR_J_temp = f9 +sinh_FR_J = f10 + +sinh_FR_Mmj = f39 + +sinh_FR_N_temp1 = f11 +sinh_FR_N_temp2 = f12 +sinh_FR_N = f13 + +sinh_FR_spos = f14 +sinh_FR_sneg = f15 + +sinh_FR_Tjhi = f32 +sinh_FR_Tjlo = f33 +sinh_FR_Tmjhi = f34 +sinh_FR_Tmjlo = f35 + +sinh_GR_mJ = r35 +sinh_GR_J = r36 + +sinh_AD_mJ = r38 +sinh_AD_J = r39 +sinh_GR_all_ones = r40 + +sinh_FR_S_hi = f9 +sinh_FR_S_hi_temp = f10 +sinh_FR_S_lo_temp1 = f11 +sinh_FR_S_lo_temp2 = f12 +sinh_FR_S_lo_temp3 = f13 + +sinh_FR_S_lo = f38 +sinh_FR_C_hi = f39 + +sinh_FR_C_hi_temp1 = f10 +sinh_FR_Y_hi = f11 +sinh_FR_Y_lo_temp = f12 +sinh_FR_Y_lo = f13 +sinh_FR_SINH = f9 + +sinh_FR_P1 = f14 +sinh_FR_P2 = f15 +sinh_FR_P3 = 
f32 +sinh_FR_P4 = f33 +sinh_FR_P5 = f34 +sinh_FR_P6 = f35 + +sinh_FR_TINY_THRESH = f9 + +sinh_FR_SINH_temp = f10 +sinh_FR_SCALE = f11 + +sinh_FR_signed_hi_lo = f10 + + +GR_SAVE_PFS = r41 +GR_SAVE_B0 = r42 +GR_SAVE_GP = r43 + +GR_Parameter_X = r44 +GR_Parameter_Y = r45 +GR_Parameter_RESULT = r46 + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 +double_sinh_arg_reduction: +ASM_TYPE_DIRECTIVE(double_sinh_arg_reduction,@object) + data8 0xB8AA3B295C17F0BC, 0x00004005 + data8 0xB17217F7D1000000, 0x00003FF8 + data8 0xCF79ABC9E3B39804, 0x00003FD0 +ASM_SIZE_DIRECTIVE(double_sinh_arg_reduction) + +double_sinh_p_table: +ASM_TYPE_DIRECTIVE(double_sinh_p_table,@object) + data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC + data8 0x8888888888888412, 0x00003FF8 + data8 0xD00D00D00D4D39F2, 0x00003FF2 + data8 0xB8EF1D28926D8891, 0x00003FEC + data8 0xD732377688025BE9, 0x00003FE5 + data8 0xB08AF9AE78C1239F, 0x00003FDE +ASM_SIZE_DIRECTIVE(double_sinh_p_table) + +double_sinh_ab_table: +ASM_TYPE_DIRECTIVE(double_sinh_ab_table,@object) + data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC + data8 0x88888888884ECDD5, 0x00003FF8 + data8 0xD00D0C6DCC26A86B, 0x00003FF2 + data8 0x8000000000000002, 0x00003FFE + data8 0xAAAAAAAAAA402C77, 0x00003FFA + data8 0xB60B6CC96BDB144D, 0x00003FF5 +ASM_SIZE_DIRECTIVE(double_sinh_ab_table) + +double_sinh_j_table: +ASM_TYPE_DIRECTIVE(double_sinh_j_table,@object) + data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 + data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 + data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 + data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 + data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 + data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 + data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 + data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 + data8 0xC5672A115506DADD, 
0x00003FFE, 0x1E78AB43, 0x00000000 + data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 + data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 + data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 + data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 + data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 + data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000 + data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 + data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 + data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 + data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 + data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 + data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 + data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 + data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 + data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 + data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 + data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 + data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 + data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 + data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 + data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 + data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 + data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 + data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 + data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 + data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 + data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 + data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 + data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 + data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 + data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 + data8 
0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 + data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 + data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 + data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 + data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 + data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 + data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000 + data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 + data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 + data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 + data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 + data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 + data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 + data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 + data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 + data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 + data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 + data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 + data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 + data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 + data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 + data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 + data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 + data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 + data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 +ASM_SIZE_DIRECTIVE(double_sinh_j_table) + +.align 32 +.global sinhl# + +.section .text +.proc sinhl# +.align 32 + +sinhl: +#ifdef _LIBC +.global __ieee754_sinhl +.type __ieee754_sinhl,@function +__ieee754_sinhl: +#endif + +// X infinity or NAN? 
+// Take invalid fault if enabled + + +{ .mfi + alloc r32 = ar.pfs,0,12,4,0 +(p0) fclass.m.unc p6,p0 = f8, 0xe3 //@qnan | @snan | @inf + mov sinh_GR_all_ones = -1 +} +;; + + +{ .mfb + nop.m 999 +(p6) fma.s0 f8 = f8,f1,f8 +(p6) br.ret.spnt b0 ;; +} + +// Put 0.25 in f9; p6 true if x < 0.25 +// Make constant that will generate inexact when squared +{ .mlx + setf.sig sinh_FR_all_ones = sinh_GR_all_ones +(p0) movl r32 = 0x000000000000fffd ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 +(p0) fclass.m.unc p7,p0 = f8, 0x07 //@zero + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.s sinh_FR_X = f0,f8 +(p7) br.ret.spnt b0 ;; +} + +// Identify denormal operands. +{ .mfi + nop.m 999 + fclass.m.unc p10,p0 = f8, 0x09 // + denorm + nop.i 999 +};; +{ .mfi + nop.m 999 + fclass.m.unc p11,p0 = f8, 0x0a // - denorm + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.s sinh_FR_SGNX = f8,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.sptk L(SINH_BY_TBL) ;; +} + + +L(SINH_BY_POLY): + +// POLY cannot overflow so there is no need to call __libm_error_support +// Set tiny_SAFE (p7) to 1(0) if answer is not tiny +// Currently we do not use tiny_SAFE. So the setting of tiny_SAFE is +// commented out. +//(p0) movl r32 = 0x000000000000fc01 +//(p0) setf.exp f10 = r32 +//(p0) fcmp.lt.unc.s1 p6,p7 = f8,f10 +// Here is essentially the algorithm for SINH_BY_POLY. Care is take for the order +// of multiplication; and P_1 is not exactly 1/3!, P_2 is not exactly 1/5!, etc. +// Note that ax = |x| +// sinh(x) = sign * (series(e^x) - series(e^-x))/2 +// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! + ax^13/13!) +// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) ) +// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! 
+ ax^4*1/13!)) ) ) +// = sign * (ax + ax*p_odd + (ax*p_even)) +// = sign * (ax + Y_lo) +// sinh(x) = sign * (Y_hi + Y_lo) +// Get the values of P_x from the table +{ .mfb +(p0) addl r34 = @ltoff(double_sinh_p_table), gp +(p10) fma.s0 f8 = f8,f8,f8 +(p10) br.ret.spnt b0 +} +;; + +{ .mfb + ld8 r34 = [r34] +(p11) fnma.s0 f8 = f8,f8,f8 +(p11) br.ret.spnt b0 +} +;; + +// Calculate sinh_FR_X2 = ax*ax and sinh_FR_X4 = ax*ax*ax*ax +{ .mmf + nop.m 999 +(p0) ldfe sinh_FR_P1 = [r34],16 +(p0) fma.s1 sinh_FR_X2 = sinh_FR_X, sinh_FR_X, f0 ;; +} + +{ .mmi +(p0) ldfe sinh_FR_P2 = [r34],16 ;; +(p0) ldfe sinh_FR_P3 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe sinh_FR_P4 = [r34],16 ;; +(p0) ldfe sinh_FR_P5 = [r34],16 + nop.i 999 ;; +} + +{ .mfi +(p0) ldfe sinh_FR_P6 = [r34],16 +(p0) fma.s1 sinh_FR_X4 = sinh_FR_X2, sinh_FR_X2, f0 + nop.i 999 ;; +} + +// Calculate sinh_FR_podd = p_odd and sinh_FR_peven = p_even +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_podd_temp1 = sinh_FR_X4, sinh_FR_P5, sinh_FR_P3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_podd_temp2 = sinh_FR_X4, sinh_FR_poly_podd_temp1, sinh_FR_P1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_peven_temp1 = sinh_FR_X4, sinh_FR_P6, sinh_FR_P4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd = sinh_FR_X2, sinh_FR_poly_podd_temp2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_poly_peven_temp2 = sinh_FR_X4, sinh_FR_poly_peven_temp1, sinh_FR_P2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven = sinh_FR_X4, sinh_FR_poly_peven_temp2, f0 + nop.i 999 ;; +} + +// Calculate sinh_FR_Y_lo = ax*p_odd + (ax*p_even) +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_X, sinh_FR_peven, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_X, sinh_FR_podd, sinh_FR_Y_lo_temp + nop.i 999 ;; +} + +// Calculate sinh_FR_SINH = Y_hi + Y_lo. 
Note that ax = Y_hi +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_SINH = sinh_FR_X, f1, sinh_FR_Y_lo + nop.i 999 ;; +} +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 +} + +// Calculate f8 = sign * (Y_hi + Y_lo) +// Go to return +{ .mfb + nop.m 999 +(p0) fma.s0 f8 = sinh_FR_SGNX,sinh_FR_SINH,f0 +(p0) br.ret.sptk b0 ;; +} + + +L(SINH_BY_TBL): + +// Now that we are at TBL; so far all we know is that |x| >= 0.25. +// The first two steps are the same for TBL and EXP, but if we are HUGE +// we want to leave now. +// Double-extended: +// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true) +// Double +// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) +// Single +// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x000000000001000d ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p6,p7 = sinh_FR_X,f9 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(SINH_HUGE) ;; +} + +// r32 = 1 +// r34 = N-1 +// r35 = N +// r36 = j +// r37 = N+1 + +// TBL can never overflow +// sinh(x) = sinh(B+R) +// = sinh(B)cosh(R) + cosh(B)sinh(R) +// +// ax = |x| = M*log2/64 + R +// B = M*log2/64 +// M = 64*N + j +// We will calcualte M and get N as (M-j)/64 +// The division is a shift. +// exp(B) = exp(N*log2 + j*log2/64) +// = 2^N * 2^(j*log2/64) +// sinh(B) = 1/2(e^B -e^-B) +// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64)) +// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) +// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) +// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32 +// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit) +// R = ax - M*log2/64 +// R = ax - M*log2_by_64_hi - M*log2_by_64_lo +// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...) 
+// = 1 + p_odd + p_even +// where the p_even uses the A coefficients and the p_even uses the B coefficients +// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd +// cosh(R) = 1 + p_even +// sinh(B) = S_hi + S_lo +// cosh(B) = C_hi +// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R) +// ****************************************************** +// STEP 1 (TBL and EXP) +// ****************************************************** +// Get the following constants. +// f9 = Inv_log2by64 +// f10 = log2by64_hi +// f11 = log2by64_lo + +{ .mmi +(p0) adds r32 = 0x1,r0 +(p0) addl r34 = @ltoff(double_sinh_arg_reduction), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and +// put them in an exponent. +// sinh_FR_spos = 2^(N-1) and sinh_FR_sneg = 2^(-N-1) +// r39 = 0xffff + (N-1) = 0xffff +N -1 +// r40 = 0xffff - (N +1) = 0xffff -N -1 + +{ .mlx + nop.m 999 +(p0) movl r38 = 0x000000000000fffe ;; +} + +{ .mmi +(p0) ldfe sinh_FR_Inv_log2by64 = [r34],16 ;; +(p0) ldfe sinh_FR_log2by64_hi = [r34],16 + nop.i 999 ;; +} + +{ .mbb +(p0) ldfe sinh_FR_log2by64_lo = [r34],16 + nop.b 999 + nop.b 999 ;; +} + +// Get the A coefficients +// f9 = A_1 +// f10 = A_2 +// f11 = A_3 + +{ .mmi + nop.m 999 +(p0) addl r34 = @ltoff(double_sinh_ab_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r34 = [r34] + nop.m 999 + nop.i 999 +} +;; + + +// Calculate M and keep it as integer and floating point. 
+// f38 = M = round-to-integer(x*Inv_log2by64) +// sinh_FR_M = M = truncate(ax/(log2/64)) +// Put the significand of M in r35 +// and the floating point representation of M in sinh_FR_M + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_M = sinh_FR_X, sinh_FR_Inv_log2by64, f0 + nop.i 999 +} + +{ .mfi +(p0) ldfe sinh_FR_A1 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcvt.fx.s1 sinh_FR_M_temp = sinh_FR_M + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnorm.s1 sinh_FR_M = sinh_FR_M_temp + nop.i 999 ;; +} + +{ .mfi +(p0) getf.sig r35 = sinh_FR_M_temp + nop.f 999 + nop.i 999 ;; +} + +// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It +// has a range of -32 thru 31. +// r35 = M +// r36 = j + +{ .mii + nop.m 999 + nop.i 999 ;; +(p0) and r36 = 0x3f, r35 ;; +} + +// Calculate R +// f13 = f44 - f12*f10 = ax - M*log2by64_hi +// f14 = f13 - f8*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo + +{ .mfi + nop.m 999 +(p0) fnma.s1 sinh_FR_R_temp = sinh_FR_M, sinh_FR_log2by64_hi, sinh_FR_X + nop.i 999 +} + +{ .mfi +(p0) ldfe sinh_FR_A2 = [r34],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 sinh_FR_R = sinh_FR_M, sinh_FR_log2by64_lo, sinh_FR_R_temp + nop.i 999 +} + +// Get the B coefficients +// f15 = B_1 +// f32 = B_2 +// f33 = B_3 + +{ .mmi +(p0) ldfe sinh_FR_A3 = [r34],16 ;; +(p0) ldfe sinh_FR_B1 = [r34],16 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe sinh_FR_B2 = [r34],16 ;; +(p0) ldfe sinh_FR_B3 = [r34],16 + nop.i 999 ;; +} + +{ .mii + nop.m 999 +(p0) shl r34 = r36, 0x2 ;; +(p0) sxt1 r37 = r34 ;; +} + +// ****************************************************** +// STEP 2 (TBL and EXP) +// ****************************************************** +// Calculate Rsquared and Rcubed in preparation for p_even and p_odd +// f12 = R*R*R +// f13 = R*R +// f14 = R <== from above + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Rsq = sinh_FR_R, sinh_FR_R, f0 +(p0) shr r36 = r37, 0x2 ;; +} + +// r34 = M-j = r35 - r36 +// r35 = N = 
(M-j)/64 + +{ .mii +(p0) sub r34 = r35, r36 + nop.i 999 ;; +(p0) shr r35 = r34, 0x6 ;; +} + +{ .mii +(p0) sub r40 = r38, r35 +(p0) adds r37 = 0x1, r35 +(p0) add r39 = r38, r35 ;; +} + +// Get the address of the J table, add the offset, +// addresses are sinh_AD_mJ and sinh_AD_J, get the T value +// f32 = T(j)_hi +// f33 = T(j)_lo +// f34 = T(-j)_hi +// f35 = T(-j)_lo + +{ .mmi +(p0) sub r34 = r35, r32 +(p0) addl r37 = @ltoff(double_sinh_j_table), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r37 = [r37] + nop.m 999 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Rcub = sinh_FR_Rsq, sinh_FR_R, f0 + nop.i 999 +} + +// ****************************************************** +// STEP 3 Now decide if we need to branch to EXP +// ****************************************************** +// Put 32 in f9; p6 true if x < 32 +// Go to EXP if |x| >= 32 + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000010004 ;; +} + +// Calculate p_even +// f34 = B_2 + Rsq *B_3 +// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) +// f36 = p_even = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven_temp1 = sinh_FR_Rsq, sinh_FR_B3, sinh_FR_B2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven_temp2 = sinh_FR_Rsq, sinh_FR_peven_temp1, sinh_FR_B1 + nop.i 999 +} + +// Calculate p_odd +// f34 = A_2 + Rsq *A_3 +// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) +// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd_temp1 = sinh_FR_Rsq, sinh_FR_A3, sinh_FR_A2 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp sinh_FR_N_temp1 = r39 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_peven = sinh_FR_Rsq, sinh_FR_peven_temp2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd_temp2 = sinh_FR_Rsq, sinh_FR_podd_temp1, sinh_FR_A1 + nop.i 999 ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_podd = 
sinh_FR_podd_temp2, sinh_FR_Rcub, sinh_FR_R + nop.i 999 +} + +// sinh_GR_mj contains the table offset for -j +// sinh_GR_j contains the table offset for +j +// p6 is true when j <= 0 + +{ .mlx +(p0) setf.exp sinh_FR_N_temp2 = r40 +(p0) movl r40 = 0x0000000000000020 ;; +} + +{ .mfi +(p0) sub sinh_GR_mJ = r40, r36 +(p0) fmerge.se sinh_FR_spos = sinh_FR_N_temp1, f1 +(p0) adds sinh_GR_J = 0x20, r36 ;; +} + +{ .mii + nop.m 999 +(p0) shl sinh_GR_mJ = sinh_GR_mJ, 5 ;; +(p0) add sinh_AD_mJ = r37, sinh_GR_mJ ;; +} + +{ .mmi + nop.m 999 +(p0) ldfe sinh_FR_Tmjhi = [sinh_AD_mJ],16 +(p0) shl sinh_GR_J = sinh_GR_J, 5 ;; +} + +{ .mfi +(p0) ldfs sinh_FR_Tmjlo = [sinh_AD_mJ],16 +(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 +(p0) add sinh_AD_J = r37, sinh_GR_J ;; +} + +{ .mmi +(p0) ldfe sinh_FR_Tjhi = [sinh_AD_J],16 ;; +(p0) ldfs sinh_FR_Tjlo = [sinh_AD_J],16 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.se sinh_FR_sneg = sinh_FR_N_temp2, f1 +(p7) br.cond.spnt L(SINH_BY_EXP) ;; +} + +{ .mfi + nop.m 999 + nop.f 999 + nop.i 999 ;; +} + +// ****************************************************** +// If NOT branch to EXP +// ****************************************************** +// Calculate S_hi and S_lo +// sinh_FR_S_hi_temp = sinh_FR_sneg * sinh_FR_Tmjhi +// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi_temp +// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - (sinh_FR_sneg * sinh_FR_Tmjlo) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_S_hi_temp = sinh_FR_sneg, sinh_FR_Tmjhi, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fms.s1 sinh_FR_S_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi_temp + nop.i 999 +} + +// Calculate C_hi +// sinh_FR_C_hi_temp1 = sinh_FR_sneg * sinh_FR_Tmjhi +// sinh_FR_C_hi = sinh_FR_spos * sinh_FR_Tjhi + sinh_FR_C_hi_temp1 + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_C_hi_temp1 = sinh_FR_sneg, sinh_FR_Tmjhi, f0 + nop.i 999 ;; +} + +// sinh_FR_S_lo_temp1 = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi +// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + 
(sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi) +// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_S_lo_temp1 ) + +{ .mfi + nop.m 999 +(p0) fms.s1 sinh_FR_S_lo_temp1 = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_C_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_C_hi_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 sinh_FR_S_lo_temp2 = sinh_FR_sneg, sinh_FR_Tmjhi, sinh_FR_S_lo_temp1 + nop.i 999 +} + +// sinh_FR_S_lo_temp1 = sinh_FR_sneg * sinh_FR_Tmjlo +// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo - sinh_FR_S_lo_temp1 +// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo -(sinh_FR_sneg * sinh_FR_Tmjlo) +// sinh_FR_S_lo = sinh_FR_S_lo_temp3 + sinh_FR_S_lo_temp2 + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_S_lo_temp1 = sinh_FR_sneg, sinh_FR_Tmjlo, f0 + nop.i 999 ;; +} + +/////////// BUG FIX fma to fms -TK +{ .mfi + nop.m 999 +(p0) fms.s1 sinh_FR_S_lo_temp3 = sinh_FR_spos, sinh_FR_Tjlo, sinh_FR_S_lo_temp1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_S_lo = sinh_FR_S_lo_temp3, f1, sinh_FR_S_lo_temp2 + nop.i 999 ;; +} + +// Y_hi = S_hi +// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo) +// sinh_FR_Y_lo_temp = sinh_FR_S_hi * sinh_FR_peven + sinh_FR_S_lo +// sinh_FR_Y_lo = sinh_FR_C_hi * sinh_FR_podd + sinh_FR_Y_lo_temp + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_S_hi, sinh_FR_peven, sinh_FR_S_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_C_hi, sinh_FR_podd, sinh_FR_Y_lo_temp + nop.i 999 ;; +} + +// sinh_FR_SINH = Y_hi + Y_lo +// f8 = answer = sinh_FR_SGNX * sinh_FR_SINH + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_SINH = sinh_FR_S_hi, f1, sinh_FR_Y_lo + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.s0 f8 = sinh_FR_SGNX, sinh_FR_SINH,f0 +(p0) br.ret.sptk b0 ;; +} + + +L(SINH_BY_EXP): + +// When p7 is 
true, we know that an overflow is not going to happen +// When p7 is false, we must check for possible overflow +// p7 is the over_SAFE flag +// Y_hi = Tjhi +// Y_lo = Tjhi * (p_odd + p_even) +Tjlo +// Scale = sign * 2^(N-1) +// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_peven + sinh_FR_podd) +// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_Y_lo_temp ) + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_peven, f1, sinh_FR_podd + nop.i 999 +} + +// Now we are in EXP. This is the only path where an overflow is possible +// but not for certain. So this is the only path where over_SAFE has any use. +// r34 still has N-1 +// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe +// There is a danger of double overflow if N-1 > 0x3fe = 1022 +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000003ffe ;; +} + +{ .mfi +(p0) cmp.gt.unc p0,p7 = r34, r32 +(p0) fmerge.s sinh_FR_SCALE = sinh_FR_SGNX, sinh_FR_spos + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_Tjhi, sinh_FR_Y_lo_temp, sinh_FR_Tjlo + nop.i 999 ;; +} + +// f8 = answer = scale * (Y_hi + Y_lo) +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_SINH_temp = sinh_FR_Y_lo, f1, sinh_FR_Tjhi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s0 f44 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 + nop.i 999 ;; +} + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p7) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 ;; +} + +// If over_SAFE is set, return +{ .mfb + nop.m 999 +(p7) fmerge.s f8 = f44,f44 +(p7) br.ret.sptk b0 ;; +} + +// Else see if we overflowed +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// If WRE is set then an overflow will not occur in EXP. +// The input value that would cause a register (WRE) value to overflow is about 2^15 +// and this input would go into the HUGE path. +// Answer with WRE is in f43. 
+ +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fma.s2 f43 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 + nop.i 999 ;; +} + +// 13FFF => 13FFF -FFFF = 4000(true) +// 4000 + 3FFF = 7FFF, which is 1 more that the exponent of the largest +// long double (7FFE). So 0 13FFF 8000000000000000 is one ulp more than +// largest long double in register bias +// Now set p8 if the answer with WRE is greater than or equal this value +// Also set p9 if the answer with WRE is less than or equal to negative this value + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x00000000013FFF ;; +} + +{ .mmf + nop.m 999 +(p0) setf.exp f41 = r32 +(p0) fsetc.s2 0x7F,0x40 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmerge.ns f42 = f41, f41 + nop.i 999 ;; +} + +// The error tag for overflow is 126 +{ .mii + nop.m 999 + nop.i 999 ;; +(p8) mov r47 = 126 ;; +} + +{ .mfb + nop.m 999 +(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 +(p8) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +(p9) mov r47 = 126 +} + +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; +} + +// Dummy multiply to generate inexact +{ .mfi + nop.m 999 +(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmerge.s f8 = f44,f44 +(p0) br.ret.sptk b0 ;; +} + +L(SINH_HUGE): + +// for SINH_HUGE, put 24000 in exponent; take sign from input; add 1 +// SAFE: SAFE is always 0 for HUGE + +{ .mlx + nop.m 999 +(p0) movl r32 = 0x0000000000015dbf ;; +} + +{ .mfi +(p0) setf.exp f9 = r32 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 sinh_FR_signed_hi_lo = sinh_FR_SGNX, f9, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s0 f44 = sinh_FR_signed_hi_lo, f9, f0 +(p0) mov r47 = 126 +} +.endp sinhl +ASM_SIZE_DIRECTIVE(sinhl) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__ieee754_sinhl) +#endif + +// Stack operations when calling error support. 
+// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 +// | | | | +// | <-GR_Y Y2->| Y2 ->| <- GR_Y | +// | | | | +// | | <- GR_X X1 ->| | +// | | | | +// sp-64 -> + sp -> + sp -> + + +// save ar.pfs save b0 restore gp +// save gp restore ar.pfs + +.proc __libm_error_region +__libm_error_region: +L(SINH_ERROR_SUPPORT): +.prologue + +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfe [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_sqrt.S b/sysdeps/ia64/fpu/e_sqrt.S new file mode 100644 index 0000000000..ee6eb653f3 --- /dev/null +++ b/sysdeps/ia64/fpu/e_sqrt.S @@ -0,0 +1,347 @@ +.file "sqrt.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. 
+// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// ******************************************************************** +// History +// ******************************************************************** +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. 
+// +// ******************************************************************** +// +// Function: Combined sqrt(x), where +// _ +// sqrt(x) = |x, for double precision x values +// +// ******************************************************************** +// +// Accuracy: Correctly Rounded +// +// ******************************************************************** +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f6, f7, f9 -f15 +// +// General Purpose Registers: +// r32-r36 (Locals) +// r37-r40 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6, p7, p8 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// All faults and exceptions should be raised correctly. +// sqrt(QNaN) = QNaN +// sqrt(SNaN) = QNaN +// sqrt(+/-0) = +/-0 +// sqrt(negative) = QNaN and error handling is called +// +// ********************************************************************* +// +// Implementation: +// +// Modified Newton-Raphson Algorithm +// +// ********************************************************************* + +#include "libm_support.h" + +GR_SAVE_PFS = r33 +GR_SAVE_B0 = r34 +GR_SAVE_GP = r35 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 + + +.section .text +.proc sqrt# +.global sqrt# +.align 64 + +sqrt: +#ifdef _LIBC +.global __sqrt +.type __sqrt,@function +__sqrt: +.global __ieee754_sqrt +.type __ieee754_sqrt,@function +__ieee754_sqrt: +#endif +{ .mfi + alloc r32= ar.pfs,0,5,4,0 + frsqrta.s0 f7,p6=f8 + nop.i 0 +} { .mlx + // BEGIN DOUBLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM + nop.m 0 + // exponent of +1/2 in r2 + movl r2 = 0x0fffe;; +} { .mmi + // +1/2 in f9 + setf.exp f9 = r2 + nop.m 0 + nop.i 0 +} { .mlx + nop.m 0 + // 3/2 in r3 + movl r3=0x3fc00000;; +} { .mfi + setf.s f10=r3 + // Step (1) + // y0 = 1/sqrt(a) in f7 + fclass.m.unc p7,p8 = f8,0x3A + nop.i 0;; +} { .mlx + nop.m 0 + // 5/2 in r2 + movl r2 = 
0x40200000 +} { .mlx + nop.m 0 + // 63/8 in r3 + movl r3 = 0x40fc0000;; +} { .mfi + setf.s f11=r2 + // Step (2) + // h = +1/2 * y0 in f6 + (p6) fma.s1 f6=f9,f7,f0 + nop.i 0 +} { .mfi + setf.s f12=r3 + // Step (3) + // g = a * y0 in f7 + (p6) fma.s1 f7=f8,f7,f0 + nop.i 0 +} { .mfi + nop.m 0 + mov f15 = f8 + nop.i 0;; +} { .mlx + nop.m 0 + // 231/16 in r2 + movl r2 = 0x41670000;; +} { .mfi + setf.s f13=r2 + // Step (4) + // e = 1/2 - g * h in f9 + (p6) fnma.s1 f9=f7,f6,f9 + nop.i 0 +} { .mlx + nop.m 0 + // 35/8 in r3 + movl r3 = 0x408c0000;; +} { .mfi + setf.s f14=r3 + // Step (5) + // S = 3/2 + 5/2 * e in f10 + (p6) fma.s1 f10=f11,f9,f10 + nop.i 0 +} { .mfi + nop.m 0 + // Step (6) + // e2 = e * e in f11 + (p6) fma.s1 f11=f9,f9,f0 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (7) + // t = 63/8 + 231/16 * e in f12 + (p6) fma.s1 f12=f13,f9,f12 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (8) + // S1 = e + e2 * S in f10 + (p6) fma.s1 f10=f11,f10,f9 + nop.i 0 +} { .mfi + nop.m 0 + // Step (9) + // e4 = e2 * e2 in f11 + (p6) fma.s1 f11=f11,f11,f0 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (10) + // t1 = 35/8 + e * t in f9 + (p6) fma.s1 f9=f9,f12,f14 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (11) + // G = g + S1 * g in f12 + (p6) fma.s1 f12=f10,f7,f7 + nop.i 0 +} { .mfi + nop.m 0 + // Step (12) + // E = g * e4 in f7 + (p6) fma.s1 f7=f7,f11,f0 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (13) + // u = S1 + e4 * t1 in f10 + (p6) fma.s1 f10=f11,f9,f10 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (14) + // g1 = G + t1 * E in f7 + (p6) fma.d.s1 f7=f9,f7,f12 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (15) + // h1 = h + u * h in f6 + (p6) fma.s1 f6=f10,f6,f6 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (16) + // d = a - g1 * g1 in f9 + (p6) fnma.s1 f9=f7,f7,f8 + nop.i 0;; +} { .mfb + nop.m 0 + // Step (17) + // g2 = g1 + d * h1 in f8 (final result) + (p6) fma.d.s0 f8=f9,f6,f7 + (p6) br.ret.sptk b0 ;; +} + +{ .mfb + nop.m 0 + (p0) mov f8 = f7 + (p8) br.ret.sptk b0 ;; +} +{ .mfb + (p7) mov r40 = 49 + nop.f 0 + 
(p7) br.cond.sptk __libm_error_region ;; +} +// END DOUBLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM +.endp sqrt# +ASM_SIZE_DIRECTIVE(sqrt) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__sqrt) +ASM_SIZE_DIRECTIVE(__ieee754_sqrt) +#endif + +// Stack operations when calling error support. +// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 +// | | | | +// | <-GR_Y Y2->| Y2 ->| <- GR_Y | +// | | | | +// | | <- GR_X X1 ->| | +// | | | | +// sp-64 -> + sp -> + sp -> + + +// save ar.pfs save b0 restore gp +// save gp restore ar.pfs + + +.proc __libm_error_region +__libm_error_region: + +// +// The (p8) return at the end of sqrt covers all those special values that are +// not negative, with the result equal to frsqrta(x) +// + +.prologue +// We are distinguishing between over(under)flow and letting +// __libm_error_support set ERANGE or do anything else needed. + +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfd [GR_Parameter_X] = f15 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore 
gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_sqrtf.S b/sysdeps/ia64/fpu/e_sqrtf.S new file mode 100644 index 0000000000..27d0bcf03d --- /dev/null +++ b/sysdeps/ia64/fpu/e_sqrtf.S @@ -0,0 +1,266 @@ +.file "sqrtf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// ********************************************************************* +// History: +// +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. 
+// +// ********************************************************************* +// +// Function: Combined sqrtf(x), where +// _ +// sqrtf(x) = |x, for single precision x values +// +// ******************************************************************** +// +// Accuracy: Correctly Rounded +// +// ******************************************************************** +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f7, f9 -f13 +// +// General Purpose Registers: +// r32-r36 (Locals) +// r37-r40 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6, p7, p8 +// +// ******************************************************************** +// +// IEEE Special Conditions: +// +// All faults and exceptions should be raised correctly. +// sqrtf(QNaN) = QNaN +// sqrtf(SNaN) = QNaN +// sqrtf(+/-0) = +/-0 +// sqrtf(negative) = QNaN and error handling is called +// +// ******************************************************************** +// +// Implementation: +// +// Modified Newton-Raphson Algorithm +// +// ******************************************************************** + +#include "libm_support.h" + +GR_SAVE_B0 = r34 +GR_SAVE_PFS = r33 +GR_SAVE_GP = r35 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + +FR_X = f13 +FR_Y = f0 +FR_RESULT = f8 + + + +.section .text +.proc sqrtf# +.global sqrtf# +.align 64 + +sqrtf: +#ifdef _LIBC +.global __sqrtf +.type __sqrtf,@function +__sqrtf: +.global __ieee754_sqrtf +.type __ieee754_sqrtf,@function +__ieee754_sqrtf: +#endif +{ .mlx + // BEGIN SINGLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM + alloc r32= ar.pfs,0,5,4,0 + // exponent of +1/2 in r2 + movl r2 = 0x0fffe +} { .mfi + // +1/2 in f12 + nop.m 0 + frsqrta.s0 f7,p6=f8 + nop.i 0;; +} { .mfi + setf.exp f12 = r2 + // Step (1) + // y0 = 1/sqrt(a) in f7 + fclass.m.unc p7,p8 = f8,0x3A + nop.i 0 +} { .mfi + nop.m 0 + // Make a copy of x just in case + mov f13 = f8 + nop.i 
0;; +} { .mfi + nop.m 0 + // Step (2) + // H0 = 1/2 * y0 in f9 + (p6) fma.s1 f9=f12,f7,f0 + nop.i 0 +} { .mfi + nop.m 0 + // Step (3) + // S0 = a * y0 in f7 + (p6) fma.s1 f7=f8,f7,f0 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (4) + // d = 1/2 - S0 * H0 in f10 + (p6) fnma.s1 f10=f7,f9,f12 + nop.i 0 +} { .mfi + nop.m 0 + // Step (0'') + // 3/2 = 1 + 1/2 in f12 + (p6) fma.s1 f12=f12,f1,f1 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (5) + // e = 1 + 3/2 * d in f12 + (p6) fma.s1 f12=f12,f10,f1 + nop.i 0 +} { .mfi + nop.m 0 + // Step (6) + // T0 = d * S0 in f11 + (p6) fma.s1 f11=f10,f7,f0 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (7) + // G0 = d * H0 in f10 + (p6) fma.s1 f10=f10,f9,f0 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (8) + // S1 = S0 + e * T0 in f7 + (p6) fma.s.s1 f7=f12,f11,f7 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (9) + // H1 = H0 + e * G0 in f12 + (p6) fma.s1 f12=f12,f10,f9 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (10) + // d1 = a - S1 * S1 in f9 + (p6) fnma.s1 f9=f7,f7,f8 + nop.i 0;;; +} { .mfb + nop.m 0 + // Step (11) + // S = S1 + d1 * H1 in f8 (final result) + (p6) fma.s.s0 f8=f9,f12,f7 + (p6) br.ret.sptk b0 ;; +// END SINGLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM +} { .mfb + nop.m 0 + (p0) mov f8 = f7 + (p8) br.ret.sptk b0 ;; +} +// +// This branch includes all those special values that are not negative, +// with the result equal to frsqrta(x) +// +.endp sqrtf +ASM_SIZE_DIRECTIVE(sqrtf) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__sqrtf) +ASM_SIZE_DIRECTIVE(__ieee754_sqrtf) +#endif + + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mii + add GR_Parameter_Y=-32,sp // Parameter 2 value +(p0) mov GR_Parameter_TAG = 50 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save 
b0 +};; +.body +{ .mib + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_sqrtl.S b/sysdeps/ia64/fpu/e_sqrtl.S new file mode 100644 index 0000000000..4054cf05f3 --- /dev/null +++ b/sysdeps/ia64/fpu/e_sqrtl.S @@ -0,0 +1,281 @@ +.file "sqrtl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// ******************************************************************** +// +// History: +// 2/02/00 (hand-optimized) +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// +// ******************************************************************** +// +// Function: Combined sqrtl(x), where +// _ +// sqrtl(x) = |x, for double-extended precision x values +// +// ******************************************************************** +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f7 -f14 +// +// General Purpose Registers: +// r32-r36 (Locals) +// r37-r40 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6, p7, p8 +// +// ******************************************************************** +// +// IEEE Special Conditions: +// +// All faults and exceptions should be raised correctly. 
+// sqrtl(QNaN) = QNaN +// sqrtl(SNaN) = QNaN +// sqrtl(+/-0) = +/-0 +// sqrtl(negative) = QNaN and error handling is called +// +// ******************************************************************** +// +// Implementation: +// +// Modified Newton-Raphson Algorithm +// +// ******************************************************************** + +#include "libm_support.h" + +GR_SAVE_PFS = r33 +GR_SAVE_B0 = r34 +GR_SAVE_GP = r35 +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + +FR_X = f15 +FR_Y = f0 +FR_RESULT = f8 + +.section .text +.proc sqrtl# +.global sqrtl# +.align 64 + +sqrtl: +#ifdef _LIBC +.global __sqrtl +.type __sqrtl,@function +__sqrtl: +.global __ieee754_sqrtl +.type __ieee754_sqrtl,@function +__ieee754_sqrtl: +#endif +{ .mlx +alloc r32= ar.pfs,0,5,4,0 + // exponent of +1/2 in r2 + movl r2 = 0x0fffe;; +} { .mfi + // +1/2 in f12 + setf.exp f12 = r2 + // Step (1) + // y0 = 1/sqrt(a) in f7 + frsqrta.s0 f7,p6=f8 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (2) + // H0 = +1/2 * y0 in f9 + (p6) fma.s1 f9=f12,f7,f0 + nop.i 0 +} { .mfi + nop.m 0 + // Step (3) + // S0 = a * y0 in f7 + (p6) fma.s1 f7=f8,f7,f0 + nop.i 0;; +} { .mfi + nop.m 0 + // Make copy input x + mov f13=f8 + nop.i 0 +} { .mfi + nop.m 0 + fclass.m.unc p7,p8 = f8,0x3A + nop.i 0;; +} { .mfi + nop.m 0 + // Step (4) + // d0 = 1/2 - S0 * H0 in f10 + (p6) fnma.s1 f10=f7,f9,f12 + nop.i 0;; +} +{ .mfi + nop.m 0 + (p0) mov f15=f8 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (5) + // H1 = H0 + d0 * H0 in f9 + (p6) fma.s1 f9=f10,f9,f9 + nop.i 0 +} { .mfi + nop.m 0 + // Step (6) + // S1 = S0 + d0 * S0 in f7 + (p6) fma.s1 f7=f10,f7,f7 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (7) + // d1 = 1/2 - S1 * H1 in f10 + (p6) fnma.s1 f10=f7,f9,f12 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (8) + // H2 = H1 + d1 * H1 in f9 + (p6) fma.s1 f9=f10,f9,f9 + nop.i 0 +} { .mfi + nop.m 0 + // Step (9) + // S2 = S1 + d1 * S1 in f7 + (p6) fma.s1 f7=f10,f7,f7 + nop.i 0;; +} { .mfi + nop.m 0 
+ // Step (10) + // d2 = 1/2 - S2 * H2 in f10 + (p6) fnma.s1 f10=f7,f9,f12 + nop.i 0 +} { .mfi + nop.m 0 + // Step (11) + // e2 = a - S2 * S2 in f12 + (p6) fnma.s1 f12=f7,f7,f8 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (12) + // S3 = S2 + e2 * H2 in f7 + (p6) fma.s1 f7=f12,f9,f7 + nop.i 0 +} { .mfi + nop.m 0 + // Step (13) + // H3 = H2 + d2 * H2 in f9 + (p6) fma.s1 f9=f10,f9,f9 + nop.i 0;; +} { .mfi + nop.m 0 + // Step (14) + // e3 = a - S3 * S3 in f12 + (p6) fnma.s1 f12=f7,f7,f8 + nop.i 0;; +} { .mfb + nop.m 0 + // Step (15) + // S = S3 + e3 * H3 in f8 (final result) + (p6) fma.s0 f8=f12,f9,f7 + (p6) br.ret.sptk b0 ;; +} +{ .mfb + (p0) mov GR_Parameter_TAG = 48 + (p0) mov f8 = f7 + (p8) br.ret.sptk b0 ;; +} +// +// This branch includes all those special values that are not negative, +// with the result equal to frsqrta(x) +// + + +// END DOUBLE EXTENDED PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM +.endp sqrtl# +ASM_SIZE_DIRECTIVE(sqrtl) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__sqrtl) +ASM_SIZE_DIRECTIVE(__ieee754_sqrtl) +#endif + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack 
+.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/k_rem_pio2.c b/sysdeps/ia64/fpu/k_rem_pio2.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/k_rem_pio2.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/k_rem_pio2f.c b/sysdeps/ia64/fpu/k_rem_pio2f.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/k_rem_pio2f.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/k_rem_pio2l.c b/sysdeps/ia64/fpu/k_rem_pio2l.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/k_rem_pio2l.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/libm-test-ulps b/sysdeps/ia64/fpu/libm-test-ulps index 022113f111..20510faefe 100644 --- a/sysdeps/ia64/fpu/libm-test-ulps +++ b/sysdeps/ia64/fpu/libm-test-ulps @@ -1,31 +1,16 @@ # Begin of automatic generation -# acos -Test "acos (0.7) == 0.7953988301841435554": -float: 1 -ifloat: 1 -ildouble: 1150 -ldouble: 1150 - # acosh Test "acosh (7) == 2.6339157938496334172": ldouble: 1 ildouble: 1 # asin -Test "asin (-0.5) == -pi/6": -float: 2 -ifloat: 2 -Test "asin (0.5) == pi/6": -float: 2 -ifloat: 2 Test "asin (0.7) == 0.77539749661075306374035335271498708": -float: 2 -ifloat: 2 double: 1 idouble: 1 -ildouble: 1147 -ldouble: 1147 +ldouble: 1 +ildouble: 1 # asinh Test "asinh (0.7) == 0.652666566082355786": @@ -33,17 +18,9 @@ ildouble: 656 ldouble: 656 # atan -Test "atan (0.7) == 0.6107259643892086165": -ildouble: 549 -ldouble: 549 - -# atan2 -Test "atan2 (0.4, 0.0003) == 1.5700463269355215718": -ildouble: 1 -ldouble: 1 -Test "atan2 (0.7, 1) == 
0.6107259643892086165": -ildouble: 549 -ldouble: 549 +#Test "atan (0.7) == 0.6107259643892086165": +#ildouble: 549 +#ldouble: 549 # atanh Test "atanh (0.7) == 0.8673005276940531944": @@ -98,8 +75,8 @@ double: 1 float: 7 idouble: 1 ifloat: 7 -ildouble: 5 -ldouble: 5 +ildouble: 6 +ldouble: 6 Test "Imaginary part of: cacosh (-2 - 3 i) == -1.9833870299165354323 + 2.1414491111159960199 i": double: 1 float: 4 @@ -215,27 +192,9 @@ ildouble: 447 ldouble: 447 # cbrt -Test "cbrt (-0.001) == -0.1": -ildouble: 717 -ldouble: 717 -Test "cbrt (-27.0) == -3.0": -double: 1 -idouble: 1 -ildouble: 948 -ldouble: 948 Test "cbrt (0.7) == 0.8879040017426007084": double: 1 idouble: 1 -ildouble: 346 -ldouble: 346 -Test "cbrt (0.970299) == 0.99": -double: 1 -idouble: 1 -ildouble: 306 -ldouble: 306 -Test "cbrt (8) == 2": -ildouble: 191 -ldouble: 191 # ccos Test "Real part of: ccos (-2 - 3 i) == -4.1896256909688072301 - 9.1092278937553365979 i": @@ -453,15 +412,17 @@ ldouble: 0.25 # cosh Test "cosh (0.7) == 1.255169005630943018": -ildouble: 309 -ldouble: 309 +ildouble: 2 +ldouble: 2 # cpow Test "Real part of: cpow (2 + 3 i, 4 + 0 i) == -119.0 - 120.0 i": double: 1 -float: 4 +float: 5 idouble: 1 -ifloat: 4 +ifloat: 5 +ldouble: 1 +ildouble: 1 Test "Imaginary part of: cpow (2 + 3 i, 4 + 0 i) == -119.0 - 120.0 i": float: 2 ifloat: 2 @@ -475,6 +436,9 @@ idouble: 1.104 ifloat: 2.5333 ildouble: 1 ldouble: 1 +Test "Real part of: cpow (2 + 0 i, 10 + 0 i) == 1024.0 + 0.0 i": +ldouble: 1 +ildouble: 1 # csin Test "Real part of: csin (0.7 + 1.2 i) == 1.1664563419657581376 + 1.1544997246948547371 i": @@ -575,8 +539,8 @@ ldouble: 2 Test "Imaginary part of: ctanh (-2 - 3 i) == -0.9653858790221331242 + 0.0098843750383224937 i": float: 1 ifloat: 1 -ildouble: 23 -ldouble: 23 +ildouble: 24 +ldouble: 24 Test "Real part of: ctanh (0 + pi/4 i) == 0.0 + 1.0 i": Test "Imaginary part of: ctanh (0 + pi/4 i) == 0.0 + 1.0 i": float: 1 @@ -655,6 +619,8 @@ float: 1 ifloat: 1 double: 1 idouble: 1 +ldouble: 1 +ildouble: 1 # 
fmod Test "fmod (-6.5, -2.3) == -1.9": @@ -906,21 +872,17 @@ ildouble: 725 ldouble: 725 # sin -Test "sin (0.7) == 0.64421768723769105367": -ildouble: 627 -ldouble: 627 +Test "sin (0.7) == 0.64421768723769105367261435139872014": +ildouble: 1 +ldouble: 1 # sincos -Test "sincos (0.7, &sin_res, &cos_res) puts 0.64421768723769105367 in sin_res": -ildouble: 627 -ldouble: 627 +Test "sincos (0.7, &sin_res, &cos_res) puts 0.64421768723769105367261435139872014 in sin_res": +ldouble: 1 +ildouble: 1 Test "sincos (0.7, &sin_res, &cos_res) puts 0.76484218728448842625585999019186495 in cos_res": -float: 1 -ifloat: 1 double: 1 idouble: 1 -ildouble: 528 -ldouble: 528 Test "sincos (M_PI_6l*2.0, &sin_res, &cos_res) puts 0.5 in cos_res": double: 1 float: 0.5 @@ -1005,6 +967,8 @@ float: 1 ifloat: 1 double: 2 idouble: 2 +ldouble: 2 +ildouble: 2 Test "y0 (1.0) == 0.088256964215676957983": double: 2 float: 1 @@ -1028,6 +992,8 @@ float: 1 ifloat: 1 double: 1 idouble: 1 +ldouble: 1 +ildouble: 1 # y1 Test "y1 (0.1) == -6.4589510947020269877": @@ -1174,17 +1140,11 @@ idouble: 1 ifloat: 1 # Maximal error of functions: -Function: "acos": -ildouble: 1149 -ldouble: 1149 - Function: "asin": -float: 2 -ifloat: 2 double: 1 idouble: 1 -ildouble: 1147 -ldouble: 1147 +ldouble: 1 +ildouble: 1 Function: "asinh": double: 1 @@ -1192,14 +1152,6 @@ idouble: 1 ildouble: 656 ldouble: 656 -Function: "atan": -ildouble: 549 -ldouble: 549 - -Function: "atan2": -ildouble: 549 -ldouble: 549 - Function: "atanh": double: 1 idouble: 1 @@ -1305,8 +1257,6 @@ ldouble: 447 Function: "cbrt": double: 1 idouble: 1 -ildouble: 948 -ldouble: 948 Function: Real part of "ccos": double: 1 @@ -1389,22 +1339,24 @@ ildouble: 529 ldouble: 529 Function: "cosh": -ildouble: 309 -ldouble: 309 +ildouble: 2 +ldouble: 2 Function: Real part of "cpow": double: 1 -float: 4 +float: 5 idouble: 1 -ifloat: 4 +ifloat: 5 +ldouble: 1 +ildouble: 1 Function: Imaginary part of "cpow": double: 1.104 float: 2.5333 idouble: 1.104 ifloat: 2.5333 -ildouble: 2 
-ldouble: 2 +ildouble: 4 +ldouble: 4 Function: Real part of "csin": float: 1 @@ -1639,6 +1591,8 @@ double: 2 float: 1 idouble: 2 ifloat: 1 +ldouble: 2 +ildouble: 2 Function: "y1": double: 3 diff --git a/sysdeps/ia64/fpu/libm_atan2_reg.S b/sysdeps/ia64/fpu/libm_atan2_reg.S new file mode 100644 index 0000000000..7a0c7034e9 --- /dev/null +++ b/sysdeps/ia64/fpu/libm_atan2_reg.S @@ -0,0 +1,1221 @@ +.file "libm_atan2_reg.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// History +//============================================================== +// 2/02/00: Initial version +// 4/04/00 Unwind support added + +#include "libm_support.h" + +.data + +.align 64 +ASM_TYPE_DIRECTIVE(Constants_atan#,@object) +Constants_atan: +data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000 +// double pi/2, single lo_pi/2, two**(-3) +data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1 +data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2 +data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3 +data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4 +data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5 +data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6 +data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7 +data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8 +data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1 +data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2 +data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3 +data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4 +// Entries Tbl_hi (double precision) +// B = 1+Index/16+1/32 Index = 0 +// Entries Tbl_lo (single precision) +// B = 1+Index/16+1/32 Index = 0 +data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000 +// Entries Tbl_hi (double precision) Index = 0,1,...,15 +// B = 2^(-1)*(1+Index/16+1/32) +// Entries Tbl_lo (single precision) +// Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32) +data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000 +data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000 +data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000 +data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000 +data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000 +data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000 +data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000 +data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000 +data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000 +data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 
0x00000000 +data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000 +data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000 +data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000 +data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000 +data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000 +data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000 +// +// Entries Tbl_hi (double precision) Index = 0,1,...,15 +// B = 2^(-2)*(1+Index/16+1/32) +// Entries Tbl_lo (single precision) +// Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32) +// +data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000 +data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000 +data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000 +data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000 +data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000 +data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000 +data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000 +data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000 +data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000 +data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000 +data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000 +data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000 +data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000 +data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000 +data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000 +data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000 +// +// Entries Tbl_hi (double precision) Index = 0,1,...,15 +// B = 2^(-3)*(1+Index/16+1/32) +// Entries Tbl_lo (single precision) +// Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32) +// +data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000 +data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000 +data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000 +data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000 +data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000 +data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000 +data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000 +data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 
0x00000000 +data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000 +data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000 +data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000 +data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000 +data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000 +data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000 +data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000 +data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000 +data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // I two doubles +data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // I_by_2 two dbls +data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // I_by_4 two dbls +data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3I_by_4 two dbls +ASM_SIZE_DIRECTIVE(Constants_atan#) +.section .text + +.proc __libm_atan2_reg# +.global __libm_atan2_reg# +.align 64 +__libm_atan2_reg: + + +{ .mfi + alloc r32 = ar.pfs,0,20,4,0 +(p0) mov f32 = f8 + nop.i 0 +} +{ .mmi + nop.m 0 +(p0) addl r39 = @ltoff(Constants_atan#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r39 = [r39] + nop.m 999 + nop.i 999 +} +;; + +{ .mfi + nop 999 // EMbo added ... +(p0) mov f33 = f9 + nop.i 0 + } { .mfi + nop 999 // EMbo added ... +(p0) fclass.nm.unc p9,p0 = f32 ,0x1FF + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fclass.nm.unc p8,p0 = f33 ,0x1FF + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fclass.m.unc p6,p0 = f33 ,0x103 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fclass.m.unc p7,p0 = f32 ,0x103 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fclass.m.unc p12,p0 = f33 ,0x0C3 + nop 999;; // EMbo added ... + } { .mfb + nop 999 // EMbo added ... +// +// Check for NatVals. +// Check for EM Unsupporteds +// Check for NaNs. +// +(p0) fclass.m.unc p13,p0 = f32 ,0x0C3 +(p6) br.cond.sptk L(ATAN_NATVAL);; + } { .mbb + nop 999 // EMbo added ... 
+(p7) br.cond.sptk L(ATAN_NATVAL) +(p8) br.cond.sptk L(ATAN_UNSUPPORTED);; + } { .mib +(p0) add r40 = 96, r39 + nop 999 // EMbo added ... +(p9) br.cond.sptk L(ATAN_UNSUPPORTED);; + } { .mib +(p0) ldfd f50 = [r39],8 + nop 999 // EMbo added ... +(p12) br.cond.sptk L(ATAN_NAN);; + } { .mfb + nop 999 // EMbo added ... +(p0) fnorm.s1 f33 = f33 +(p13) br.cond.sptk L(ATAN_NAN);; + } { .mfi +(p0) ldfs f51 = [r39],4 +// +// Remove sign bits from exponents +// Load 2**(-3) +// Normalize the input argument. +// +(p0) fnorm.s1 f32 = f32 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) mov f82 = f1 + nop 999;; // EMbo added ... + } { .mmi + nop 999;; // EMbo added ... +(p0) ldfs f78 = [r39],180 + nop 999;; // EMbo added ... + } { .mmi +(p0) getf.exp r36 = f33;; +// +// Get exp and sign of ArgX +// Get exp and sign of ArgY +// Load 2**(-3) and increment ptr to Q_4. +// +(p0) getf.exp r37 = f32 +(p0) shr.u r36 = r36,17;; + } { .mfi + nop 999 // EMbo added ... +(p0) fmerge.s f84 = f1,f32 +(p0) shr.u r37 = r37,17;; + } { .mfi + nop 999 // EMbo added ... +// +// ArgX_abs = |ArgX| +// ArgY_abs = |ArgY| +// sign_X is sign bit of ArgX +// sign_Y is sign bit of ArgY +// +(p0) fmerge.s f83 = f1,f33 +(p0) cmp.eq.unc p8,p9 = 0x00000, r37;; + } { .mfi + nop 999 // EMbo added ... +(p8) fadd.s1 f34 = f0, f1 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p9) fsub.s1 f34 = f0, f1 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fmin.s1 f36 = f83, f84 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fmax.s1 f35 = f83, f84 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// Is ArgX_abs >= ArgY_abs +// Is sign_Y == 0? +// +(p0) fcmp.ge.s1 p6,p7 = f83,f84 + nop 999;; // EMbo added ... 
+ } { .mii +(p6) cmp.eq.unc p10, p11 = 0x00000, r36 +(p6) add r38 = r0, r0;; +// +// U = max(ArgX_abs,ArgY_abs) +// V = min(ArgX_abs,ArgY_abs) +// if p6, swap = 0 +// if p7, swap = 1 +// +// +// Let M = 1.0 +// if p8, s_Y = 1.0 +// if p9, s_Y = -1.0 +// +(p7) add r38 = 1,r0;; + } { .mfi + nop 999 // EMbo added ... +(p0) frcpa.s1 f37, p6 = f36, f35 + nop 999;; // EMbo added ... + } { .mfb + nop 999 // EMbo added ... +// +// E = frcpa(V,U) +// +(p10) fsub.s1 f82 = f82, f1 +(p6) br.cond.sptk L(ATAN_STEP2);; + } { .mib + nop 999 // EMbo added ... + nop 999 // EMbo added ... +// /**************************************************/ +// /********************* STEP2 **********************/ +// /**************************************************/ +(p0) br.cond.spnt L(ATAN_SPECIAL_HANDLING);; + } +L(ATAN_STEP2): + { .mlx + nop 999 // EMbo added ... +(p0) movl r47 = 0x8400000000000000 + } { .mlx + nop 999 // EMbo added ... +(p0) movl r48 = 0x0000000000000100;; + } { .mfi + nop 999 // EMbo added ... +(p0) fmpy.s1 f38 = f37, f36 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fcmp.lt.unc.s0 p0,p9 = f9,f1 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fcmp.lt.unc.s0 p0,p8 = f8,f1 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// Q = E * V +// +(p11) fadd.s1 f82 = f82, f1 + nop 999;; // EMbo added ... + } { .mfi +(p0) getf.sig r46 = f38 +(p0) fcmp.lt.unc p6,p7 = f38,f78 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fmpy.s1 f38 = f37, f36 +(p0) extr.u r42 = r46, 59, 4;; + } { .mfi + nop 999 // EMbo added ... +(p0) fmpy.s1 f50 = f82, f50 +(p0) dep r47 = r42, r47, 59, 4 + } { .mfi + nop 999 // EMbo added ... +(p0) fmpy.s1 f51 = f82, f51 + nop 999;; // EMbo added ... + } { .mmi + nop 999;; // EMbo added ... +// +// Is Q < 2**(-3)? +// +// +// Do fcmp to raise any denormal operand +// exceptions. +// +(p0) getf.exp r45 = f38 + nop 999;; // EMbo added ... 
+ } { .mib +// +// lookup = b_1 b_2 b_3 B_4 +// +// +// Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 +// +(p0) andcm r41 = 0x0003, r45 + nop 999 // EMbo added ... +// +// We waited a few extra cycles so P_lo and P_hi could be calculated. +// Load the constant 256 for loading up table entries. +// +// /**************************************************/ +// /********************* STEP3 **********************/ +// /**************************************************/ +(p6) br.cond.spnt L(ATAN_POLY);; + } { .mii +(p0) setf.sig f39 = r47 +(p0) cmp.eq.unc p8, p9 = 0x0000, r41 +// +// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 +// point to beginning of Tbl_hi entries - k = 0. +// +(p0) add r40 = 16, r39 + } { .mmi +(p0) ldfe f73 = [r39],-16;; +(p9) sub r41 = r41,r0,1 +(p9) add r40 = 16,r40 + } { .mfi +(p8) ldfd f48 = [r40],8 +(p0) fmpy.s1 f50 = f34, f50 +(p0) xor r38 = r36,r38;; + } { .mmi +(p0) ldfe f71 = [r39],-16;; +(p8) ldfs f49 = [r40],8 +(p9) pmpy2.r r41 = r41,r48;; + } { .mfi +(p0) ldfe f69 = [r39],-16 +// +// Let z_hi have exponent and sign of original Q +// Load the Tbl_hi(0) else, increment pointer. +// +(p0) fmerge.se f39 = f38,f39 +(p9) shladd r42 = r42,0x0004,r41;; + } { .mmi +(p9) add r40 = r40, r42;; +(p9) ldfd f48 = [r40],8 + nop 999;; // EMbo added ... + } { .mmi +(p0) ldfe f67 = [r39],-16;; +(p9) ldfs f49 = [r40],8 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// U_prime_hi = U + V * z_hi +// Load the Tbl_lo(0) +// +(p0) fma.s1 f40 = f36, f39, f35 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fnma.s1 f42 = f35, f39, f36 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) mov f52 = f48 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) frcpa.s1 f43, p6 = f1, f40 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// U_prime_lo = U - U_prime_hi +// k = k * 256 - result can be 0, 256, or 512. 
+// +(p0) fsub.s1 f41 = f35, f40 +(p0) cmp.eq.unc p7, p6 = 0x00000, r38 + } { .mfi + nop 999 // EMbo added ... +(p0) fmpy.s1 f52 = f34, f52 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p7) fadd.s1 f54 = f0, f1 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p6) fsub.s1 f54 = f0, f1 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fnma.s1 f80 = f43, f40, f1 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fadd.s1 f79 = f41, f40 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f41 = f36, f39, f41 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f56 = f54, f52, f50 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f43 = f80, f43, f43 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// U_prime_lo = U - U_hold +// lookup -> lookup * 16 + k +// +// +// V_prime = V - U * z_hi +// U_prime_lo = V * z_hi + U_prime_lo +// +(p0) fsub.s1 f79 = f35, f79 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fnma.s1 f80 = f43, f40, f1 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// C_hi = frcpa(1,U_prime_hi) +// U_prime_lo = U_prime_lo + U_hold +// +// +// C_hi_hold = 1 - C_hi * U_prime_hi (1) +// +// +// C_hi = C_hi + C_hi * C_hi_hold (1) +// +// +// C_hi_hold = 1 - C_hi * U_prime_hi (2) +// +(p0) fadd.s1 f41 = f41, f79 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// C_hi = C_hi + C_hi * C_hi_hold (2) +// +(p0) fma.s1 f43 = f80, f43, f43 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// C_hi_hold = 1 - C_hi * U_prime_hi (3) +// +(p0) fnma.s1 f80 = f43, f40, f1 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// C_hi = C_hi + C_hi * C_hi_hold (3) +// +(p0) fma.s1 f43 = f80, f43, f43 + nop 999;; // EMbo added ... 
+ } { .mfi + nop 999 // EMbo added ... +// +// w_hi = V_prime * C_hi +// +(p0) fmpy.s1 f44 = f42, f43 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fmpy.s1 f46 = f44, f44 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// wsq = w_hi * w_hi +// w_lo = = V_prime - w_hi * U_prime_hi +// +(p0) fnma.s1 f45 = f44, f40, f42 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f47 = f46, f73, f71 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// poly = Q_3 + wsq * Q_4 +// w_lo = = w_lo - w_hi * U_prime_lo +// +(p0) fnma.s1 f45 = f44, f41, f45 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f47 = f46, f47, f69 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// poly = Q_2 + wsq * poly +// w_lo = = w_lo * C_hi +// +(p0) fmpy.s1 f45 = f43, f45 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f47 = f46, f47, f67 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// poly = Q_1 + wsq * poly +// A_lo = Tbl_lo + w_lo +// swap = xor(swap,sign_X) +// +(p0) fadd.s1 f53 = f49, f45 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// Is (swap) != 0 ? +// poly = wsq * poly +// A_hi = Tbl_hi +// +(p0) fmpy.s1 f47 = f46, f47 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// poly = wsq * poly +// +// +// if (p6) sigma = -1.0 +// if (p7) sigma = 1.0 +// +(p0) fmpy.s1 f47 = f44, f47 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// P_hi = s_Y * P_hi +// A_lo = A_lo + poly +// +(p0) fadd.s1 f53 = f53, f47 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// A_lo = A_lo + w_hi +// A_hi = s_Y * A_hi +// +(p0) fadd.s1 f53 = f53, f44 + nop 999;; // EMbo added ... + } { .mfb + nop 999 // EMbo added ... 
+// +// result_hi = P_hi + sigma * A_hi +// result_lo = P_lo + sigma * A_lo +// +(p0) fma.s1 f55 = f54, f53, f51 +(p0) br.cond.sptk L(RETURN_ATAN);; +} +// +// result = result_hi + result_lo * s_Y (User Supplied Rounding Mode) +// +// (p0) fma.d.s0 f57 = f55, f34, f56 +// +// /**************************************************/ +// /********************* STEP4 **********************/ +// /**************************************************/ +// +L(ATAN_POLY): +{ .mmi +(p0) xor r38 = r36,r38 +(p0) addl r39 = @ltoff(Constants_atan#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r39 = [r39] + nop.m 999 + nop.i 999 +} +;; + + +{ .mlx + nop 999 // EMbo added ... +(p0) movl r47 = 0x24005;; + } { .mfi +(p0) add r39 = 128, r39 +(p0) fnma.s1 f81 = f37, f35, f1 +(p0) cmp.eq.unc p7, p6 = 0x00000, r38;; + } { .mmf + nop 999 // EMbo added ... +(p0) ldfe f77 = [r39],-16 +// +// Iterate 3 times E = E + E*(1.0 - E*U) +// Also load P_8, P_7, P_6, P_5, P_4 +// E_hold = 1.0 - E * U (1) +// A_temp = Q +// +(p0) mov f85 = f38;; + } { .mmf + nop 999 // EMbo added ... +(p0) ldfe f76 = [r39],-16 +(p6) fsub.s1 f54 = f0, f1;; + } { .mmf + nop 999 // EMbo added ... +(p0) ldfe f75 = [r39],-16 +// +// E = E + E_hold*E (1) +// Point to P_8. +// +(p0) fma.s1 f37 = f37, f81, f37;; + } { .mmf + nop 999 // EMbo added ... +(p0) ldfe f74 = [r39],-16 +(p0) fnma.s1 f64 = f85, f35, f36;; + } { .mmf + nop 999 // EMbo added ... +(p0) ldfe f72 = [r39],-16 +(p7) fadd.s1 f54 = f0, f1;; + } { .mmf + nop 999 // EMbo added ... +(p0) ldfe f70 = [r39],-16 +// +// E_hold = 1.0 - E * U (2) +// +(p0) fnma.s1 f81 = f37, f35, f1;; + } { .mmf + nop 999 // EMbo added ... +(p0) ldfe f68 = [r39],-16 +(p0) fmpy.s1 f50 = f34, f50;; + } { .mmf + nop 999 // EMbo added ... +(p0) ldfe f66 = [r39],-16 +(p0) fmpy.d.s0 f67 = f67, f67 + } { .mfi + nop 999 // EMbo added ... +// +// E = E + E_hold*E (2) +// +(p0) fma.s1 f37 = f37, f81, f37 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... 
+// +// E_hold = 1.0 - E * U (3) +// +(p0) fnma.s1 f81 = f37, f35, f1 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// E = E + E_hold*E (3) +// At this point E approximates 1/U to roughly working precision +// z = V*E approximates V/U +// +(p0) fma.s1 f37 = f37, f81, f37 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// z = V * E +// +(p0) fmpy.s1 f59 = f36, f37 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fmpy.s1 f64 = f64, f37 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// zsq = z * z +// Also load P_3 +// +(p0) fmpy.s1 f60 = f59, f59 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fadd.s1 f52 = f85, f64 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f62 = f60, f77, f76 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f63 = f60, f70, f68 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// z8 = zsq * zsq +// Also load P_2 +// +(p0) fmpy.s1 f61 = f60, f60 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fsub.s1 f85 = f85, f52 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fmerge.s f65 = f52,f52 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f62 = f60, f62, f75 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f63 = f60, f63, f66 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... 
+// +// z8 = z8 * z8 +// Also load P_1 +// poly1 = _4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) +// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3)) +// +// +// poly1 = P_7 + zsq * P_8 +// poly2 = P_2 + zsq * P_3 +// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*poly1)) +// poly2 = zsq*(P_1 + zsq*poly2) +// +// +// poly1 = P_6 + zsq * poly1 +// poly2 = P_1 + zsq * poly2 +// poly1 = P_4 + zsq*(P_5 + zsq*poly1) +// poly2 = zsq*poly2 +// +(p0) fmpy.s1 f61 = f61, f61 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fadd.s1 f64 = f85, f64 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f62 = f60, f62, f74 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// poly1 = P_5 + zsq * poly1 +// poly2 = zsq * poly2 +// poly1 = P_4 + zsq*poly1 +// +(p0) fmpy.s1 f63 = f63, f60 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// poly1 = P_4 + zsq * poly1 +// swap = xor(swap,sign_X) +// +(p0) fma.s1 f62 = f60, f62, f72 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// poly = z8*poly1 + poly2 (Typo in writeup) +// Is (swap) != 0 ? +// +// +// z_lo = V - A_temp * U +// if (p7) sigma = 1.0 +// Writeup shows A_temp as A_hi +// +// +// z_lo = z_lo * E +// if (p6) sigma = -1.0 +// z_lo = (V - A_temp * U) *E +// +// +// Fixup added to force inexact later - +// A_hi = A_temp + z_lo +// z_lo = (A_temp - A_hi) + z_lo +// z_lo = A_hi - z_lo -A_hi + z_lo = about 0 +// +(p0) fma.s1 f47 = f61, f62, f63 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// A_lo = z * poly + z_lo +// +(p0) fma.s1 f53 = f59, f47, f64 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fadd.s1 f52 = f65, f53 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fsub.s1 f65 = f65, f52 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fmpy.s1 f52 = f34, f52 + nop 999;; // EMbo added ... 
+ } { .mfi + nop 999 // EMbo added ... +(p0) fadd.s1 f53 = f65, f53 + nop 999 // EMbo added ... + } { .mfi +(p0) setf.exp f65 = r47 +(p0) fma.s1 f56 = f54, f52, f50 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fclass.m.unc p6,p0 = f53,0x007 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// P_hi = s_Y * P_hi +// A_hi = s_Y * A_hi +// +// +// result_hi = P_hi + sigma * A_hi +// +(p6) mov f53 = f65 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// tmp = P_hi - result_hi +// +(p0) fsub.s1 f65 = f50, f56 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fma.s1 f65 = f52, f54, f65 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// tmp = sigma * A_hi + tmp +// sigma = A_lo * sigma + P_lo +// +(p0) fma.s1 f54 = f53, f54, f51 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// result_lo = s_Y * sigma + tmp +// +(p0) fma.s1 f55 = f34, f54, f65 + nop 999;; // EMbo added ... + } { .mfb + nop.m 0 + mov f34 = f1 +(p0) br.cond.sptk L(RETURN_ATAN);; +} +// +// result = result_hi + result_lo (User Supplied Rounding Mode) +// +// (p0) fadd.d.s0 f57 = f55, f56 +L(ATAN_UNSUPPORTED): +L(ATAN_NATVAL): + { .mfb + nop 999 // EMbo added ... +// +// Deal with the NatVal and unsupported cases. +// Raise invalid if warrented. +// +(p0) fmpy.d.s0 f57 = f8, f9 +br.cond.sptk L(RETURN_ATAN);; + } +L(ATAN_NAN): + { .mfb + nop 999 // EMbo added ... +// +// If only one NaN, then generate the resulting +// NaN and return - may raise invalid. +// +(p0) fmpy.d.s0 f57 = f8, f9 +(p0) br.cond.sptk L(RETURN_ATAN);; + } +L(ATAN_SPECIAL_HANDLING): + + { .mmf +(p0) addl r39 = @ltoff(Constants_atan#), gp + nop.m 999 +(p0) fcmp.lt.s0 p0,p7 = f8,f1 + } +;; + +// +// Raise denormal operand faults if necessary +// + +{ .mfi + ld8 r39 = [r39] +(p0) fcmp.lt.s0 p0,p6 = f9,f1 + nop 999;; // EMbo added ... +} +;; + + + +{ .mfi + nop 999 // EMbo added ... 
+(p0) fclass.m.unc p6,p7 = f32,0x007 + nop 999;; // EMbo added ... + } { .mlx + nop 999 // EMbo added ... +(p0) movl r47 = 992;; + } { .mib +(p0) add r39 = r39, r47 + nop 999 // EMbo added ... +(p7) br.cond.sptk L(ATAN_ArgY_Not_ZERO);; + } { .mfi + nop 999 // EMbo added ... +(p6) fclass.m.unc p14,p0 = f33,0x035 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p6) fclass.m.unc p15,p0 = f33,0x036 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p6) fclass.m.unc p13,p0 = f33,0x007 + nop 999 // EMbo added ... + } { .mfi +(p0) ldfd f56 = [r39],8 + nop 999 // EMbo added ... + nop 999;; // EMbo added ... + } { .mfi +(p0) ldfd f55 = [r39],-8 +(p14) fmerge.s f56 = f32,f0 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// Return sign_Y * 0 when Y = +/-0 and X > 0 +// +(p14) fmerge.s f55 = f32,f0 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p15) fmerge.s f56 = f32,f56 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// Return sign_Y * PI when X < -0 +// +// +(p15) fmerge.s f55 = f32,f55 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fadd.d.s0 f57 = f56,f55 + nop.i 0 + } { .bbb +// +// Call error support function for atan(0,0) +// - expected value already computed. +// + nop.b 0 + nop.b 0 +(p0) br.cond.sptk L(RETURN_ATAN) + } +L(ATAN_ArgY_Not_ZERO): + { .mfi + nop 999 // EMbo added ... +(p0) fclass.m.unc p9,p10 = f32,0x023 + nop 999;; // EMbo added ... + } { .mfb + nop 999 // EMbo added ... +(p9) fclass.m.unc p6,p0 = f33,0x017 +(p10) br.cond.sptk L(ATAN_ArgY_Not_INF);; + } { .mfi +(p6) add r39 = 16,r39 +(p9) fclass.m.unc p7,p0 = f33,0x021 + nop 999;; // EMbo added ... + } { .mmf + nop 999 // EMbo added ... +(p0) ldfd f56 = [r39],8 +(p9) fclass.m.unc p8,p0 = f33,0x022;; + } { .mbb +(p0) ldfd f55 = [r39],-8 + nop 999 // EMbo added ... + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... 
+(p6) fmerge.s f56 = f32,f56 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p6) fmerge.s f55 = f32,f55 + nop 999;; // EMbo added ... + } { .mfb + nop 999 // EMbo added ... +// +// Load PI/2 and adjust its sign. +// Return +PI/2 when ArgY = +Inf and ArgX = +/-0,normal +// Return -PI/2 when ArgY = -Inf and ArgX = +/-0,normal +// +(p6) fadd.d.s0 f57 = f56, f55 +(p6) br.cond.sptk L(RETURN_ATAN);; + } { .mmi +(p7) add r39 = 32,r39;; +(p7) ldfd f56 = [r39],8 + nop 999;; // EMbo added ... + } { .mmi + nop 999;; // EMbo added ... +(p7) ldfd f55 = [r39],-8 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p7) fmerge.s f56 = f32,f56 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p7) fmerge.s f55 = f32,f55 + nop 999;; // EMbo added ... + } { .mfb + nop 999 // EMbo added ... +// +// Load PI/4 and adjust its sign. +// Return +PI/4 when ArgY = +Inf and ArgX = +Inf +// Return -PI/4 when ArgY = -Inf and ArgX = +Inf +// +(p7) fadd.d.s0 f57 = f56, f55 +(p7) br.cond.sptk L(RETURN_ATAN);; + } { .mmi +(p8) add r39 = 48,r39;; +(p8) ldfd f56 =[r39],8 + nop 999;; // EMbo added ... + } { .mmi + nop 999;; // EMbo added ... +(p8) ldfd f55 =[r39],-8 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p8) fmerge.s f56 = f32,f56 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p8) fmerge.s f55 = f32,f55 + nop 999;; // EMbo added ... + } { .mfb + nop 999 // EMbo added ... +// +// Load 3PI/4 and adjust its sign. +// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf +// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf +// +(p8) fadd.d.s0 f57 = f56, f55 +(p8) br.cond.sptk L(RETURN_ATAN);; + } +L(ATAN_ArgY_Not_INF): + { .mfi + nop 999 // EMbo added ... +(p0) fclass.m.unc p6,p0 = f33,0x007 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p0) fclass.m.unc p7,p0 = f33,0x021 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... 
+(p0) fclass.m.unc p8,p0 = f33,0x022 +(p6) add r39 = 16,r39;; + } { .mfi +(p6) ldfd f56 =[r39],8 + nop 999 // EMbo added ... + nop 999;; // EMbo added ... + } { .mmi + nop 999;; // EMbo added ... +(p6) ldfd f55 =[r39],-8 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p6) fmerge.s f56 = f32,f56 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p6) fmerge.s f55 = f32,f55 + nop 999;; // EMbo added ... + } { .mfb + nop 999 // EMbo added ... +// +// return = sign_Y * PI/2 when ArgX = +/-0 +// +(p6) fadd.d.s0 f57 = f56, f55 +(p6) br.cond.sptk L(RETURN_ATAN);; + } { .mfi + nop 999 // EMbo added ... +(p7) fmerge.s f56 = f32,f0 + nop 999 // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p7) fmerge.s f55 = f32,f0 + nop 999;; // EMbo added ... + } { .mfb + nop 999 // EMbo added ... +// +// return = sign_Y * 0 when ArgX = Inf +// +(p7) fadd.d.s0 f57 = f56, f55 +(p7) br.cond.sptk L(RETURN_ATAN);; + } { .mfi +(p8) ldfd f56 = [r39],8 + nop 999 // EMbo added ... + nop 999;; // EMbo added ... + } { .mmi + nop 999;; // EMbo added ... +(p8) ldfd f55 = [r39],-8 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p8) fmerge.s f56 = f32,f56 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +(p8) fmerge.s f55 = f32,f55 + nop 999;; // EMbo added ... + } { .mfi + nop 999 // EMbo added ... +// +// return = sign_Y * PI when ArgX = -Inf +// +(p8) fadd.d.s0 f57 = f56, f55 + nop 999 // EMbo added ... + };; +L(RETURN_ATAN): +// mov f8 = f57 ;; +// The answer is in f57. 
+// But Z_hi is f56 +// Z_lo is f55 +// s_Y is f34 +// W is in f9 and untouched + +{ .mfi + nop 999 +mov f8 = f56 + nop.i 0 +};; + +{ .mfi + nop 999 +mov f10 = f55 + nop.i 999 +} +{ .mfb + nop 999 +mov f11 = f34 +br.ret.sptk b0 +};; + +.endp __libm_atan2_reg +ASM_SIZE_DIRECTIVE(__libm_atan2_reg) diff --git a/sysdeps/ia64/fpu/libm_error.c b/sysdeps/ia64/fpu/libm_error.c new file mode 100644 index 0000000000..26916fd110 --- /dev/null +++ b/sysdeps/ia64/fpu/libm_error.c @@ -0,0 +1,3545 @@ +// +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, James +// Edwards, and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. + +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 3/22/00: Updated to support flexible and dynamic error handling. 
+// 8/16/00: Changed all matherr function-calls to use the pmatherr +// function-pointers. +// 10/03/00: Corrected a scalb type. +// 11/28/00: Changed INPUT_XL to INPUT_XD for scalb_underflow case. +// 12/07/00: Added code to make scalbn error support equivalent to ldexp. +// 2/07/01: Added __declspec(align(16)) to long double constants to correct +// alignment problem. +// + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include "libm_support.h" + +#ifndef _LIBC +_LIB_VERSION_TYPE +#if defined( __POSIX__ ) +_LIB_VERSION = _POSIX_; +#elif defined( __XOPEN__ ) +_LIB_VERSION = _XOPEN_; +#elif defined( __SVID__ ) +_LIB_VERSION = _SVID_; +#elif defined( __IEEE__ ) +_LIB_VERSION = _IEEE_; +#else +_LIB_VERSION = _ISOC_; +#endif +#endif + +/************************************************************/ +/* matherrX function pointers and setusermatherrX functions */ +/************************************************************/ +#if 0 +int (*pmatherrf)(struct exceptionf*) = MATHERR_F; +int (*pmatherr)(struct EXC_DECL_D*) = MATHERR_D; +int (*pmatherrl)(struct exceptionl*) = matherrl; + +void __libm_setusermatherrf( int(*user_merrf)(struct exceptionf*) ) +{ pmatherrf = ( (user_merrf==NULL)? (MATHERR_F) : (user_merrf) ); } + +void __libm_setusermatherr( int(*user_merr)(struct EXC_DECL_D*) ) +{ pmatherr = ( (user_merr==NULL)? (MATHERR_D) : (user_merr) ); } + +void __libm_setusermatherrl( int(*user_merrl)(struct exceptionl*) ) +{ pmatherrl = ( (user_merrl==NULL)? 
(matherrl) : (user_merrl) ); } +#endif + +/***********************************************/ +/* error-handling function, libm_error_support */ +/***********************************************/ +void __libm_error_support(void *arg1,void *arg2,void *retval,error_types input_tag) +{ + + +# ifdef __cplusplus +struct __exception exc; +# else +struct exception exc; +# endif + +struct exceptionf excf; +struct exceptionl excl; + +# if defined opensource || defined _LIBC +#define ALIGNIT +#define ALIGNATTR __attribute__ ((__aligned__ (16))) +# else +#define ALIGNIT __declspec(align(16)) +#define ALIGNATTR +# endif + +const char float_inf[4] = {0x00,0x00,0x80,0x7F}; +const char float_huge[4] = {0xFF,0xFF,0x7F,0x7F}; +const char float_zero[4] = {0x00,0x00,0x00,0x00}; +const char float_neg_inf[4] = {0x00,0x00,0x80,0xFF}; +const char float_neg_huge[4] = {0xFF,0xFF,0x7F,0xFF}; +const char float_neg_zero[4] = {0x00,0x00,0x00,0x80}; +ALIGNIT +const char double_inf[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0x7F}; +ALIGNIT +//const char double_huge[8] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xEF,0x7F}; +ALIGNIT +const char double_zero[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; +ALIGNIT +const char double_neg_inf[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0xFF}; +ALIGNIT +//const char double_neg_huge[8] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xEF,0xFF}; +ALIGNIT +const char double_neg_zero[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80}; +ALIGNIT +const char long_double_inf[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0xFF,0x7F,0x00,0x00,0x00,0x00,0x00,0x00}; +ALIGNIT +//const char long_double_huge[16] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0x7F,0x00,0x00,0x00,0x00,0x00,0x00}; +ALIGNIT +const char long_double_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; +ALIGNIT +const char long_double_neg_inf[16] ALIGNATTR = 
{0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00}; +ALIGNIT +//const char long_double_neg_huge[16] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFE,0xFF,0x00,0x00,0x00,0x00,0x00,0x00}; +ALIGNIT +const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x80,0x00,0x00,0x00,0x00,0x00,0x00}; + +#define RETVAL_HUGE_VALL *(long double *)retval = *(long double *)long_double_inf +#define RETVAL_NEG_HUGE_VALL *(long double *)retval = *(long double *)long_double_neg_inf +#define RETVAL_HUGEL *(long double *)retval = (long double)*(float *)float_huge +#define RETVAL_NEG_HUGEL *(long double *)retval =(long double)*(float*)float_neg_huge + +#define RETVAL_HUGE_VALD *(double *)retval = *(double *) double_inf +#define RETVAL_NEG_HUGE_VALD *(double *)retval = *(double *) double_neg_inf +#define RETVAL_HUGED *(double *)retval = (double) *(float *)float_huge +#define RETVAL_NEG_HUGED *(double *)retval = (double) *(float *) float_neg_huge + +#define RETVAL_HUGE_VALF *(float *)retval = *(float *) float_inf +#define RETVAL_NEG_HUGE_VALF *(float *)retval = *(float *) float_neg_inf +#define RETVAL_HUGEF *(float *)retval = *(float *) float_huge +#define RETVAL_NEG_HUGEF *(float *)retval = *(float *) float_neg_huge + +#define RETVAL_ZEROL *(long double *)retval = *(long double *)long_double_zero +#define RETVAL_ZEROD *(double *)retval = *(double *)double_zero +#define RETVAL_ZEROF *(float *)retval = *(float *)float_zero + +#define RETVAL_NEG_ZEROL *(long double *)retval = *(long double *)long_double_neg_zero +#define RETVAL_NEG_ZEROD *(double *)retval = *(double *)double_neg_zero +#define RETVAL_NEG_ZEROF *(float *)retval = *(float *)float_neg_zero + +#define RETVAL_ONEL *(long double *)retval = (long double) 1.0 +#define RETVAL_ONED *(double *)retval = 1.0 +#define RETVAL_ONEF *(float *)retval = 1.0f + +#define NOT_MATHERRL excl.arg1=*(long double *)arg1;excl.arg2=*(long double *)arg2;excl.retval=*(long double 
*)retval;if(!matherrl(&excl)) +#define NOT_MATHERRD exc.arg1=*(double *)arg1;exc.arg2=*(double *)arg2;exc.retval=*(double *)retval;if(!MATHERR_D(&exc)) +#define NOT_MATHERRF excf.arg1=*(float *)arg1;excf.arg2=*(float *)arg2;excf.retval=*(float *)retval;if(!MATHERR_F(&excf)) + +#define ifSVID if(_LIB_VERSION==_SVID_) + +#define NAMEL excl.name +#define NAMED exc.name +#define NAMEF excf.name + +// +// These should work OK for MS because they are ints - +// leading underbars are not necessary. +// + +#define DOMAIN 1 +#define SING 2 +#define OVERFLOW 3 +#define UNDERFLOW 4 +#define TLOSS 5 +#define PLOSS 6 + +#define SINGL excl.type = SING +#define DOMAINL excl.type = DOMAIN +#define OVERFLOWL excl.type = OVERFLOW +#define UNDERFLOWL excl.type = UNDERFLOW +#define TLOSSL excl.type = TLOSS +#define SINGD exc.type = SING +#define DOMAIND exc.type = DOMAIN +#define OVERFLOWD exc.type = OVERFLOW +#define UNDERFLOWD exc.type = UNDERFLOW +#define TLOSSD exc.type = TLOSS +#define SINGF excf.type = SING +#define DOMAINF excf.type = DOMAIN +#define OVERFLOWF excf.type = OVERFLOW +#define UNDERFLOWF excf.type = UNDERFLOW +#define TLOSSF excf.type = TLOSS + +#define INPUT_XL (excl.arg1=*(long double*)arg1) +#define INPUT_XD (exc.arg1=*(double*)arg1) +#define INPUT_XF (excf.arg1=*(float*)arg1) +#define INPUT_YL (excl.arg1=*(long double*)arg2) +#define INPUT_YD (exc.arg1=*(double*)arg2) +#define INPUT_YF (excf.arg1=*(float*)arg2) +#define INPUT_RESL (*(long double *)retval) +#define INPUT_RESD (*(double *)retval) +#define INPUT_RESF (*(float *)retval) + +#define WRITEL_LOG_ZERO fputs("logl: SING error\n",stderr) +#define WRITED_LOG_ZERO fputs("log: SING error\n",stderr) +#define WRITEF_LOG_ZERO fputs("logf: SING error\n",stderr) +#define WRITEL_LOG_NEGATIVE fputs("logl: DOMAIN error\n",stderr) +#define WRITED_LOG_NEGATIVE fputs("log: DOMAIN error\n",stderr) +#define WRITEF_LOG_NEGATIVE fputs("logf: DOMAIN error\n",stderr) +#define WRITEL_Y0_ZERO fputs("y0l: DOMAIN 
error\n",stderr) +#define WRITED_Y0_ZERO fputs("y0: DOMAIN error\n",stderr) +#define WRITEF_Y0_ZERO fputs("y0f: DOMAIN error\n",stderr) +#define WRITEL_Y0_NEGATIVE fputs("y0l: DOMAIN error\n",stderr) +#define WRITED_Y0_NEGATIVE fputs("y0: DOMAIN error\n",stderr) +#define WRITEF_Y0_NEGATIVE fputs("y0f: DOMAIN error\n",stderr) +#define WRITEL_Y1_ZERO fputs("y1l: DOMAIN error\n",stderr) +#define WRITED_Y1_ZERO fputs("y1: DOMAIN error\n",stderr) +#define WRITEF_Y1_ZERO fputs("y1f: DOMAIN error\n",stderr) +#define WRITEL_Y1_NEGATIVE fputs("y1l: DOMAIN error\n",stderr) +#define WRITED_Y1_NEGATIUE fputs("y1: DOMAIN error\n",stderr) +#define WRITEF_Y1_NEGATIVE fputs("y1f: DOMAIN error\n",stderr) +#define WRITEL_YN_ZERO fputs("ynl: DOMAIN error\n",stderr) +#define WRITED_YN_ZERO fputs("yn: DOMAIN error\n",stderr) +#define WRITEF_YN_ZERO fputs("ynf: DOMAIN error\n",stderr) +#define WRITEL_YN_NEGATIVE fputs("ynl: DOMAIN error\n",stderr) +#define WRITED_YN_NEGATIVE fputs("yn: DOMAIN error\n",stderr) +#define WRITEF_YN_NEGATIVE fputs("ynf: DOMAIN error\n",stderr) +#define WRITEL_LOG1P_ZERO fputs("log1pl: SING error\n",stderr) +#define WRITED_LOG1P_ZERO fputs("log1p: SING error\n",stderr) +#define WRITEF_LOG1P_ZERO fputs("log1pf: SING error\n",stderr) +#define WRITEL_LOG1P_NEGATIVE fputs("log1pl: DOMAIN error\n",stderr) +#define WRITED_LOG1P_NEGATIVE fputs("log1p: DOMAIN error\n",stderr) +#define WRITEF_LOG1P_NEGATIVE fputs("log1pf: DOMAIN error\n",stderr) +#define WRITEL_LOG10_ZERO fputs("log10l: SING error\n",stderr) +#define WRITED_LOG10_ZERO fputs("log10: SING error\n",stderr) +#define WRITEF_LOG10_ZERO fputs("log10f: SING error\n",stderr) +#define WRITEL_LOG10_NEGATIVE fputs("log10l: DOMAIN error\n",stderr) +#define WRITED_LOG10_NEGATIVE fputs("log10: DOMAIN error\n",stderr) +#define WRITEF_LOG10_NEGATIVE fputs("log10f: DOMAIN error\n",stderr) +#define WRITEL_POW_ZERO_TO_ZERO fputs("powl(0,0): DOMAIN error\n",stderr) +#define WRITED_POW_ZERO_TO_ZERO fputs("pow(0,0): DOMAIN 
error\n",stderr) +#define WRITEF_POW_ZERO_TO_ZERO fputs("powf(0,0): DOMAIN error\n",stderr) +#define WRITEL_POW_ZERO_TO_NEGATIVE fputs("powl(0,negative): DOMAIN error\n",stderr) +#define WRITED_POW_ZERO_TO_NEGATIVE fputs("pow(0,negative): DOMAIN error\n",stderr) +#define WRITEF_POW_ZERO_TO_NEGATIVE fputs("powf(0,negative): DOMAIN error\n",stderr) +#define WRITEL_POW_NEG_TO_NON_INTEGER fputs("powl(negative,non-integer): DOMAIN error\n",stderr) +#define WRITED_POW_NEG_TO_NON_INTEGER fputs("pow(negative,non-integer): DOMAIN error\n",stderr) +#define WRITEF_POW_NEG_TO_NON_INTEGER fputs("powf(negative,non-integer): DOMAIN error\n",stderr) +#define WRITEL_ATAN2_ZERO_BY_ZERO fputs("atan2l: DOMAIN error\n",stderr) +#define WRITED_ATAN2_ZERO_BY_ZERO fputs("atan2: DOMAIN error\n",stderr) +#define WRITEF_ATAN2_ZERO_BY_ZERO fputs("atan2f: DOMAIN error\n",stderr) +#define WRITEL_SQRT fputs("sqrtl: DOMAIN error\n",stderr) +#define WRITED_SQRT fputs("sqrt: DOMAIN error\n",stderr) +#define WRITEF_SQRT fputs("sqrtf: DOMAIN error\n",stderr) +#define WRITEL_FMOD fputs("fmodl: DOMAIN error\n",stderr) +#define WRITED_FMOD fputs("fmod: DOMAIN error\n",stderr) +#define WRITEF_FMOD fputs("fmodf: DOMAIN error\n",stderr) +#define WRITEL_REM fputs("remainderl: DOMAIN error\n",stderr) +#define WRITED_REM fputs("remainder: DOMAIN error\n",stderr) +#define WRITEF_REM fputs("remainderf: DOMAIN error\n",stderr) +#define WRITEL_ACOS fputs("acosl: DOMAIN error\n",stderr) +#define WRITED_ACOS fputs("acos: DOMAIN error\n",stderr) +#define WRITEF_ACOS fputs("acosf: DOMAIN error\n",stderr) +#define WRITEL_ASIN fputs("asinl: DOMAIN error\n",stderr) +#define WRITED_ASIN fputs("asin: DOMAIN error\n",stderr) +#define WRITEF_ASIN fputs("asinf: DOMAIN error\n",stderr) +#define WRITEL_ACOSH fputs("acoshl: DOMAIN error\n",stderr) +#define WRITED_ACOSH fputs("acosh: DOMAIN error\n",stderr) +#define WRITEF_ACOSH fputs("acoshf: DOMAIN error\n",stderr) +#define WRITEL_ATANH_GT_ONE fputs("atanhl: DOMAIN 
error\n",stderr) +#define WRITED_ATANH_GT_ONE fputs("atanh: DOMAIN error\n",stderr) +#define WRITEF_ATANH_GT_ONE fputs("atanhf: DOMAIN error\n",stderr) +#define WRITEL_ATANH_EQ_ONE fputs("atanhl: SING error\n",stderr) +#define WRITED_ATANH_EQ_ONE fputs("atanh: SING error\n",stderr) +#define WRITEF_ATANH_EQ_ONE fputs("atanhf: SING error\n",stderr) +#define WRITEL_LGAMMA_NEGATIVE fputs("lgammal: SING error\n",stderr) +#define WRITED_LGAMMA_NEGATIVE fputs("lgamma: SING error\n",stderr) +#define WRITEF_LGAMMA_NEGATIVE fputs("lgammaf: SING error\n",stderr) +#define WRITEL_GAMMA_NEGATIVE fputs("gammal: SING error\n",stderr) +#define WRITED_GAMMA_NEGATIVE fputs("gamma: SING error\n",stderr) +#define WRITEF_GAMMA_NEGATIVE fputs("gammaf: SING error\n",stderr) +#define WRITEL_J0_TLOSS fputs("j0l: TLOSS error\n",stderr) +#define WRITEL_Y0_TLOSS fputs("y0l: TLOSS error\n",stderr) +#define WRITEL_J1_TLOSS fputs("j1l: TLOSS error\n",stderr) +#define WRITEL_Y1_TLOSS fputs("y1l: TLOSS error\n",stderr) +#define WRITEL_JN_TLOSS fputs("jnl: TLOSS error\n",stderr) +#define WRITEL_YN_TLOSS fputs("ynl: TLOSS error\n",stderr) +#define WRITED_J0_TLOSS fputs("j0: TLOSS error\n",stderr) +#define WRITED_Y0_TLOSS fputs("y0: TLOSS error\n",stderr) +#define WRITED_J1_TLOSS fputs("j1: TLOSS error\n",stderr) +#define WRITED_Y1_TLOSS fputs("y1: TLOSS error\n",stderr) +#define WRITED_JN_TLOSS fputs("jn: TLOSS error\n",stderr) +#define WRITED_YN_TLOSS fputs("yn: TLOSS error\n",stderr) +#define WRITEF_J0_TLOSS fputs("j0f: TLOSS error\n",stderr) +#define WRITEF_Y0_TLOSS fputs("y0f: TLOSS error\n",stderr) +#define WRITEF_J1_TLOSS fputs("j1f: TLOSS error\n",stderr) +#define WRITEF_Y1_TLOSS fputs("y1f: TLOSS error\n",stderr) +#define WRITEF_JN_TLOSS fputs("jnf: TLOSS error\n",stderr) +#define WRITEF_YN_TLOSS fputs("ynf: TLOSS error\n",stderr) + +/***********************/ +/* IEEE Path */ +/***********************/ +if(_LIB_VERSION==_IEEE_) return; + +/***********************/ +/* C9X Path */ 
+/***********************/ +else if(_LIB_VERSION==_ISOC_) +{ + switch(input_tag) + { + case logl_zero: + case log_zero: + case logf_zero: + case log10l_zero: + case log10_zero: + case log10f_zero: + case log2l_zero: + case log2_zero: + case log2f_zero: + case log1pl_zero: + case log1p_zero: + case log1pf_zero: + case powl_overflow: + case pow_overflow: + case powf_overflow: + case powl_underflow: + case pow_underflow: + case powf_underflow: + case expl_overflow: + case exp_overflow: + case expf_overflow: + case expl_underflow: + case exp_underflow: + case expf_underflow: + case exp2l_overflow: + case exp2_overflow: + case exp2f_overflow: + case exp2l_underflow: + case exp2_underflow: + case exp2f_underflow: + case exp10l_overflow: + case exp10_overflow: + case exp10f_overflow: + case expm1l_overflow: + case expm1_overflow: + case expm1f_overflow: + case hypotl_overflow: + case hypot_overflow: + case hypotf_overflow: + case sinhl_overflow: + case sinh_overflow: + case sinhf_overflow: + case atanhl_eq_one: + case atanh_eq_one: + case atanhf_eq_one: + case scalbl_overflow: + case scalb_overflow: + case scalbf_overflow: + case scalbl_underflow: + case scalb_underflow: + case scalbf_underflow: + case coshl_overflow: + case cosh_overflow: + case coshf_overflow: + case nextafterl_overflow: + case nextafter_overflow: + case nextafterf_overflow: + case scalbnl_overflow: + case scalbn_overflow: + case scalbnf_overflow: + case scalbnl_underflow: + case scalbn_underflow: + case scalbnf_underflow: + case ldexpl_overflow: + case ldexp_overflow: + case ldexpf_overflow: + case ldexpl_underflow: + case ldexp_underflow: + case ldexpf_underflow: + case lgammal_overflow: + case lgamma_overflow: + case lgammaf_overflow: + case lgammal_negative: + case lgamma_negative: + case lgammaf_negative: + case gammal_overflow: + case gamma_overflow: + case gammaf_overflow: + case gammal_negative: + case gamma_negative: + case gammaf_negative: + case ilogbl_zero: + case ilogb_zero: + case 
ilogbf_zero: + { + ERRNO_RANGE; break; + } + case logl_negative: + case log_negative: + case logf_negative: + case log10l_negative: + case log10_negative: + case log10f_negative: + case log2l_negative: + case log2_negative: + case log2f_negative: + case log1pl_negative: + case log1p_negative: + case log1pf_negative: + case sqrtl_negative: + case sqrt_negative: + case sqrtf_negative: + case atan2l_zero: + case atan2_zero: + case atan2f_zero: + case powl_zero_to_negative: + case powl_neg_to_non_integer: + case pow_zero_to_negative: + case pow_neg_to_non_integer: + case powf_zero_to_negative: + case powf_neg_to_non_integer: + case fmodl_by_zero: + case fmod_by_zero: + case fmodf_by_zero: + case atanhl_gt_one: + case atanh_gt_one: + case atanhf_gt_one: + case acosl_gt_one: + case acos_gt_one: + case acosf_gt_one: + case asinl_gt_one: + case asin_gt_one: + case asinf_gt_one: + case logbl_zero: + case logb_zero: + case logbf_zero: + case acoshl_lt_one: + case acosh_lt_one: + case acoshf_lt_one: + case y0l_zero: + case y0_zero: + case y0f_zero: + case y1l_zero: + case y1_zero: + case y1f_zero: + case ynl_zero: + case yn_zero: + case ynf_zero: + case y0l_negative: + case y0_negative: + case y0f_negative: + case y1l_negative: + case y1_negative: + case y1f_negative: + case ynl_negative: + case yn_negative: + case ynf_negative: + { + ERRNO_DOMAIN; break; + } + default: + abort(); + } + return; +} + +/***********************/ +/* _POSIX_ Path */ +/***********************/ + +else if(_LIB_VERSION==_POSIX_) +{ +switch(input_tag) + { + case gammal_overflow: + case lgammal_overflow: + { + RETVAL_HUGE_VALL; ERRNO_RANGE; break; + } + case gamma_overflow: + case lgamma_overflow: + { + RETVAL_HUGE_VALD; ERRNO_RANGE; break; + } + case gammaf_overflow: + case lgammaf_overflow: + { + RETVAL_HUGE_VALF; ERRNO_RANGE; break; + } + case gammal_negative: + case gamma_negative: + case gammaf_negative: + case lgammal_negative: + case lgamma_negative: + case lgammaf_negative: + { + ERRNO_DOMAIN; 
break; + } + case ldexpl_overflow: + case ldexpl_underflow: + case ldexp_overflow: + case ldexp_underflow: + case ldexpf_overflow: + case ldexpf_underflow: + case scalbnl_overflow: + case scalbnl_underflow: + case scalbn_overflow: + case scalbn_underflow: + case scalbnf_overflow: + case scalbnf_underflow: + { + ERRNO_RANGE; break; + } + case atanhl_gt_one: + case atanhl_eq_one: + /* atanhl(|x| >= 1) */ + { + ERRNO_DOMAIN; break; + } + case atanh_gt_one: + case atanh_eq_one: + /* atanh(|x| >= 1) */ + { + ERRNO_DOMAIN; break; + } + case atanhf_gt_one: + case atanhf_eq_one: + /* atanhf(|x| >= 1) */ + { + ERRNO_DOMAIN; break; + } + case sqrtl_negative: + /* sqrtl(x < 0) */ + { + ERRNO_DOMAIN; break; + } + case sqrt_negative: + /* sqrt(x < 0) */ + { + ERRNO_DOMAIN; break; + } + case sqrtf_negative: + /* sqrtf(x < 0) */ + { + ERRNO_DOMAIN; break; + } + case y0l_zero: + case y1l_zero: + case ynl_zero: + /* y0l(0) */ + /* y1l(0) */ + /* ynl(0) */ + { + RETVAL_NEG_HUGE_VALL; ERRNO_DOMAIN; break; + } + case y0_zero: + case y1_zero: + case yn_zero: + /* y0(0) */ + /* y1(0) */ + /* yn(0) */ + { + RETVAL_NEG_HUGE_VALD; ERRNO_DOMAIN; break; + } + case y0f_zero: + case y1f_zero: + case ynf_zero: + /* y0f(0) */ + /* y1f(0) */ + /* ynf(0) */ + { + RETVAL_NEG_HUGE_VALF; ERRNO_DOMAIN; break; + } + case y0l_negative: + case y1l_negative: + case ynl_negative: + /* y0l(x < 0) */ + /* y1l(x < 0) */ + /* ynl(x < 0) */ + { + RETVAL_NEG_HUGE_VALL; ERRNO_DOMAIN; break; + } + case y0_negative: + case y1_negative: + case yn_negative: + /* y0(x < 0) */ + /* y1(x < 0) */ + /* yn(x < 0) */ + { + RETVAL_NEG_HUGE_VALD; ERRNO_DOMAIN; break; + } + case y0f_negative: + case y1f_negative: + case ynf_negative: + /* y0f(x < 0) */ + /* y1f(x < 0) */ + /* ynf(x < 0) */ + { + RETVAL_NEG_HUGE_VALF; ERRNO_DOMAIN; break; + } + case logl_zero: + case log1pl_zero: + case log10l_zero: + /* logl(0) */ + /* log1pl(0) */ + /* log10l(0) */ + { + RETVAL_NEG_HUGE_VALL; ERRNO_RANGE; break; + } + case log_zero: + case 
log1p_zero: + case log10_zero: + case log2l_zero: + /* log(0) */ + /* log1p(0) */ + /* log10(0) */ + { + RETVAL_NEG_HUGE_VALD; ERRNO_RANGE; break; + } + case logf_zero: + case log1pf_zero: + case log10f_zero: + /* logf(0) */ + /* log1pf(0) */ + /* log10f(0) */ + { + RETVAL_NEG_HUGE_VALF; ERRNO_RANGE; break; + } + case logl_negative: + case log1pl_negative: + case log10l_negative: + case log2l_negative: + /* logl(x < 0) */ + /* log1pl(x < 0) */ + /* log10l(x < 0) */ + { + ERRNO_DOMAIN; break; + } + case log_negative: + case log1p_negative: + case log10_negative: + case log2_negative: + /* log(x < 0) */ + /* log1p(x < 0) */ + /* log10(x < 0) */ + { + ERRNO_DOMAIN; break; + } + case logf_negative: + case log1pf_negative: + case log10f_negative: + case log2f_negative: + /* logf(x < 0) */ + /* log1pf(x < 0) */ + /* log10f(x < 0) */ + { + ERRNO_DOMAIN; break; + } + case expl_overflow: + /* expl overflow */ + { + RETVAL_HUGE_VALL; ERRNO_RANGE; break; + } + case exp_overflow: + /* exp overflow */ + { + RETVAL_HUGE_VALD; ERRNO_RANGE; break; + } + case expf_overflow: + /* expf overflow */ + { + RETVAL_HUGE_VALF; ERRNO_RANGE; break; + } + case expl_underflow: + /* expl underflow */ + { + RETVAL_ZEROL; ERRNO_RANGE; break; + } + case exp_underflow: + /* exp underflow */ + { + RETVAL_ZEROD; ERRNO_RANGE; break; + } + case expf_underflow: + /* expf underflow */ + { + RETVAL_ZEROF; ERRNO_RANGE; break; + } + case j0l_gt_loss: + case y0l_gt_loss: + case j1l_gt_loss: + case y1l_gt_loss: + case jnl_gt_loss: + case ynl_gt_loss: + /* jn and yn doubl-extended> XLOSS */ + { + RETVAL_ZEROL; ERRNO_RANGE; break; + } + case j0_gt_loss: + case y0_gt_loss: + case j1_gt_loss: + case y1_gt_loss: + case jn_gt_loss: + case yn_gt_loss: + /* jn and yn double > XLOSS */ + { + RETVAL_ZEROD; ERRNO_RANGE; break; + } + case j0f_gt_loss: + case y0f_gt_loss: + case j1f_gt_loss: + case y1f_gt_loss: + case jnf_gt_loss: + case ynf_gt_loss: + /* j0n and y0n > XLOSS */ + { + RETVAL_ZEROF; ERRNO_RANGE; break; + } 
+ case powl_zero_to_zero: + /* powl 0**0 */ + { + break; + } + case pow_zero_to_zero: + /* pow 0**0 */ + { + break; + } + case powf_zero_to_zero: + /* powf 0**0 */ + { + break; + } + case powl_overflow: + /* powl(x,y) overflow */ + { + if (INPUT_RESL < 0) RETVAL_NEG_HUGE_VALL; + else RETVAL_HUGE_VALL; + ERRNO_RANGE; break; + } + case pow_overflow: + /* pow(x,y) overflow */ + { + if (INPUT_RESD < 0) RETVAL_NEG_HUGE_VALD; + else RETVAL_HUGE_VALD; + ERRNO_RANGE; break; + } + case powf_overflow: + /* powf(x,y) overflow */ + { + if (INPUT_RESF < 0) RETVAL_NEG_HUGE_VALF; + else RETVAL_HUGE_VALF; + ERRNO_RANGE; break; + } + case powl_underflow: + /* powl(x,y) underflow */ + { + RETVAL_ZEROL; ERRNO_RANGE; break; + } + case pow_underflow: + /* pow(x,y) underflow */ + { + RETVAL_ZEROD; ERRNO_RANGE; break; + } + case powf_underflow: + /* powf(x,y) underflow */ + { + RETVAL_ZEROF; ERRNO_RANGE; break; + } + case powl_zero_to_negative: + /* 0**neg */ + { + ERRNO_DOMAIN; break; + } + case pow_zero_to_negative: + /* 0**neg */ + { + ERRNO_DOMAIN; break; + } + case powf_zero_to_negative: + /* 0**neg */ + { + ERRNO_DOMAIN; break; + } + case powl_neg_to_non_integer: + /* neg**non_integral */ + { + ERRNO_DOMAIN; break; + } + case pow_neg_to_non_integer: + /* neg**non_integral */ + { + ERRNO_DOMAIN; break; + } + case powf_neg_to_non_integer: + /* neg**non-integral */ + { + ERRNO_DOMAIN; break; + } + case powl_nan_to_zero: + /* powl(NaN,0.0) */ + /* Special Error */ + { + break; + } + case pow_nan_to_zero: + /* pow(NaN,0.0) */ + { + break; + } + case powf_nan_to_zero: + /* powf(NaN,0.0) */ + { + break; + } + case atan2l_zero: + /* atan2l(0,0) */ + { + /* XXX arg1 and arg2 are switched!!!! */ + if (signbit (*(long double *) arg1)) + /* y == -0 */ + *(long double *) retval = copysignl (M_PIl, *(long double *) arg2); + else + *(long double *) retval = *(long double *) arg2; + ERRNO_DOMAIN; break; + } + case atan2_zero: + /* atan2(0,0) */ + { + /* XXX arg1 and arg2 are switched!!!! 
*/ + if (signbit (*(double *) arg1)) + /* y == -0 */ + *(double *) retval = copysign (M_PI, *(double *) arg2); + else + *(double *) retval = *(double *) arg2; + ERRNO_DOMAIN; break; + } + case + atan2f_zero: + /* atan2f(0,0) */ + { + if (signbit (*(float *) arg2)) + /* y == -0 */ + *(float *) retval = copysignf (M_PI, *(float *) arg1); + else + *(float *) retval = *(float *) arg1; + ERRNO_DOMAIN; break; + } + case expm1l_overflow: + /* expm1 overflow */ + { + ERRNO_RANGE; break; + } + case expm1_overflow: + /* expm1 overflow */ + { + ERRNO_RANGE; break; + } + case expm1f_overflow: + /* expm1f overflow */ + { + ERRNO_RANGE; break; + } + case expm1l_underflow: + /* expm1 underflow */ + { + ERRNO_RANGE; break; + } + case expm1_underflow: + /* expm1 underflow */ + { + ERRNO_RANGE; break; + } + case expm1f_underflow: + /* expm1f underflow */ + { + ERRNO_RANGE; break; + } + case hypotl_overflow: + /* hypotl overflow */ + { + RETVAL_HUGE_VALL; ERRNO_RANGE; break; + } + case hypot_overflow: + /* hypot overflow */ + { + RETVAL_HUGE_VALD; ERRNO_RANGE; break; + } + case hypotf_overflow: + /* hypotf overflow */ + { + RETVAL_HUGE_VALF; ERRNO_RANGE; break; + } + case scalbl_underflow: + /* scalbl underflow */ + { + if (INPUT_XL < 0) RETVAL_NEG_ZEROL; + else RETVAL_ZEROL; + ERRNO_RANGE; break; + } + case scalb_underflow: + /* scalb underflow */ + { + if (INPUT_XD < 0) RETVAL_NEG_ZEROD; + else RETVAL_ZEROD; + ERRNO_RANGE; break; + } + case scalbf_underflow: + /* scalbf underflow */ + { + if (INPUT_XF < 0) RETVAL_NEG_ZEROF; + else RETVAL_ZEROF; + ERRNO_RANGE; break; + } + case scalbl_overflow: + /* scalbl overflow */ + { + if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL; + else RETVAL_HUGE_VALL; + ERRNO_RANGE; break; + } + case scalb_overflow: + /* scalb overflow */ + { + if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD; + else RETVAL_HUGE_VALD; + ERRNO_RANGE; break; + } + case scalbf_overflow: + /* scalbf overflow */ + { + if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF; + else RETVAL_HUGE_VALF; + 
ERRNO_RANGE; break; + } + case acoshl_lt_one: + /* acoshl(x < 1) */ + { + ERRNO_DOMAIN; break; + } + case acosh_lt_one: + /* acosh(x < 1) */ + { + ERRNO_DOMAIN; break; + } + case acoshf_lt_one: + /* acoshf(x < 1) */ + { + ERRNO_DOMAIN; break; + } + case acosl_gt_one: + /* acosl(x > 1) */ + { + ERRNO_DOMAIN; break; + } + case acos_gt_one: + /* acos(x > 1) */ + { + ERRNO_DOMAIN; break; + } + case acosf_gt_one: + /* acosf(x > 1) */ + { + ERRNO_DOMAIN; break; + } + case asinl_gt_one: + /* asinl(x > 1) */ + { + ERRNO_DOMAIN; break; + } + case asin_gt_one: + /* asin(x > 1) */ + { + ERRNO_DOMAIN; break; + } + case asinf_gt_one: + /* asinf(x > 1) */ + { + ERRNO_DOMAIN; break; + } + case remainderl_by_zero: + case fmodl_by_zero: + /* fmodl(x,0) */ + { + ERRNO_DOMAIN; break; + } + case remainder_by_zero: + case fmod_by_zero: + /* fmod(x,0) */ + { + ERRNO_DOMAIN; break; + } + case remainderf_by_zero: + case fmodf_by_zero: + /* fmodf(x,0) */ + { + ERRNO_DOMAIN; break; + } + case coshl_overflow: + /* coshl overflows */ + { + RETVAL_HUGE_VALL; ERRNO_RANGE; break; + } + case cosh_overflow: + /* cosh overflows */ + { + RETVAL_HUGE_VALD; ERRNO_RANGE; break; + } + case coshf_overflow: + /* coshf overflows */ + { + RETVAL_HUGE_VALF; ERRNO_RANGE; break; + } + case sinhl_overflow: + /* sinhl overflows */ + { + if (INPUT_XL > 0) RETVAL_HUGE_VALL; + else RETVAL_NEG_HUGE_VALL; + ERRNO_RANGE; break; + } + case sinh_overflow: + /* sinh overflows */ + { + if (INPUT_XD > 0) RETVAL_HUGE_VALD; + else RETVAL_NEG_HUGE_VALD; + ERRNO_RANGE; break; + } + case sinhf_overflow: + /* sinhf overflows */ + { + if (INPUT_XF > 0) RETVAL_HUGE_VALF; + else RETVAL_NEG_HUGE_VALF; + ERRNO_RANGE; break; + } + case logbl_zero: + /* logbl(0) */ + { + ERRNO_DOMAIN; break; + } + case logb_zero: + /* logb(0) */ + { + ERRNO_DOMAIN; break; + } + case logbf_zero: + /* logbf(0) */ + { + ERRNO_DOMAIN; break; + } + case ilogbl_zero: + /* ilogbl(0) */ + { + ERRNO_RANGE; break; + } + case ilogb_zero: + /* ilogb(0) */ + { + 
ERRNO_RANGE; break; + } + case ilogbf_zero: + /* ilogbf(0) */ + { + ERRNO_RANGE; break; + } + default: + abort(); +} +return; +/* _POSIX_ */ +} + +/*******************************/ +/* __SVID__ and __XOPEN__ Path */ +/*******************************/ +else +{ + switch(input_tag) + { + case ldexpl_overflow: + case ldexpl_underflow: + case ldexp_overflow: + case ldexp_underflow: + case ldexpf_overflow: + case ldexpf_underflow: + case scalbnl_overflow: + case scalbnl_underflow: + case scalbn_overflow: + case scalbn_underflow: + case scalbnf_overflow: + case scalbnf_underflow: + { + ERRNO_RANGE; break; + } + case sqrtl_negative: + /* sqrtl(x < 0) */ + { + DOMAINL; NAMEL = (char *) "sqrtl"; + ifSVID + { + RETVAL_ZEROL; + NOT_MATHERRL + { + WRITEL_SQRT; + ERRNO_DOMAIN; + } + } + else + { /* NaN already computed */ + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case sqrt_negative: + /* sqrt(x < 0) */ + { + DOMAIND; NAMED = (char *) "sqrt"; + ifSVID + { + + RETVAL_ZEROD; + NOT_MATHERRD + { + WRITED_SQRT; + ERRNO_DOMAIN; + } + } + else + { /* NaN already computed */ + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case sqrtf_negative: + /* sqrtf(x < 0) */ + { + DOMAINF; NAMEF = (char *) "sqrtf"; + ifSVID + { + RETVAL_ZEROF; + NOT_MATHERRF + { + WRITEF_SQRT; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case logl_zero: + case log2l_zero: + /* logl(0) */ + { + SINGL; NAMEL = (char *) "logl"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_LOG_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case log_zero: + case log2_zero: + /* log(0) */ + { + SINGD; NAMED = (char *) "log"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_LOG_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + 
NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case logf_zero: + case log2f_zero: + /* logf(0) */ + { + SINGF; NAMEF = (char *) "logf"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_LOG_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + + case logl_negative: + case log2l_negative: + /* logl(x < 0) */ + { + DOMAINL; NAMEL = (char *) "logl"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_LOG_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case log_negative: + case log2_negative: + /* log(x < 0) */ + { + DOMAIND; NAMED = (char *) "log"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_LOG_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case logf_negative: + case log2f_negative: + /* logf(x < 0) */ + { + DOMAINF; NAMEF = (char *) "logf"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_LOG_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF{ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case log1pl_zero: + /* log1pl(-1) */ + { + SINGL; NAMEL = (char *) "log1pl"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_LOG1P_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case log1p_zero: + /* log1p(-1) */ + { + SINGD; NAMED = (char *) "log1p"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_LOG1P_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case log1pf_zero: + /* log1pf(-1) */ + { + SINGF; NAMEF = 
(char *) "log1pf"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_LOG1P_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {}ERRNO_DOMAIN; + } + *(float *)retval = excf.retval; + break; + } + case log1pl_negative: + /* log1pl(x < -1) */ + { + DOMAINL; NAMEL = (char *) "log1pl"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_LOG1P_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case log1p_negative: + /* log1p(x < -1) */ + { + DOMAIND; NAMED = (char *) "log1p"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_LOG1P_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case log1pf_negative: + /* log1pf(x < -1) */ + { + DOMAINF; NAMEF = (char *) "log1pf"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_LOG1P_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case log10l_zero: + /* log10l(0) */ + { + SINGL; NAMEL = (char *) "log10l"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_LOG10_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case log10_zero: + /* log10(0) */ + { + SINGD; NAMED = (char *) "log10"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_LOG10_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case log10f_zero: + /* log10f(0) */ + { + SINGF; NAMEF = (char *) "log10f"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_LOG10_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float 
*)retval = excf.retval; + break; + } + case log10l_negative: + /* log10l(x < 0) */ + { + DOMAINL; NAMEL = (char *) "log10l"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_LOG10_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case log10_negative: + /* log10(x < 0) */ + { + DOMAIND; NAMED = (char *) "log10"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_LOG10_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case log10f_negative: + /* log10f(x < 0) */ + { + DOMAINF; NAMEF = (char *) "log10f"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_LOG10_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case expl_overflow: + /* expl overflow */ + { + OVERFLOWL; NAMEL = (char *) "expl"; + ifSVID + { + RETVAL_HUGEL; + } + else + { + RETVAL_HUGE_VALL; + } + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; + break; + } + case exp_overflow: + /* exp overflow */ + { + OVERFLOWD; NAMED = (char *) "exp"; + ifSVID + { + RETVAL_HUGED; + } + else + { + RETVAL_HUGE_VALD; + } + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; + break; + } + case expf_overflow: + /* expf overflow */ + { + OVERFLOWF; NAMEF = (char *) "expf"; + ifSVID + { + RETVAL_HUGEF; + } + else + { + RETVAL_HUGE_VALF; + } + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case expl_underflow: + /* expl underflow */ + { + UNDERFLOWL; NAMEL = (char *) "expl"; RETVAL_ZEROL; + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; + break; + } + case exp_underflow: + /* exp underflow */ + { + UNDERFLOWD; NAMED = (char *) "exp"; RETVAL_ZEROD; + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = 
exc.retval; + break; + } + case expf_underflow: + /* expf underflow */ + { + UNDERFLOWF; NAMEF = (char *) "expf"; RETVAL_ZEROF; + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case powl_zero_to_zero: + /* powl 0**0 */ + { + DOMAINL; NAMEL = (char *) "powl"; + ifSVID + { + RETVAL_ZEROL; + NOT_MATHERRL + { + WRITEL_POW_ZERO_TO_ZERO; + ERRNO_RANGE; + } + *(long double *)retval = excl.retval; + } + else RETVAL_ONEL; + break; + } + case pow_zero_to_zero: + /* pow 0**0 */ + { + DOMAIND; NAMED = (char *) "pow"; + ifSVID + { + RETVAL_ZEROD; + NOT_MATHERRD + { + WRITED_POW_ZERO_TO_ZERO; + ERRNO_RANGE; + } + *(double *)retval = exc.retval; + } + else RETVAL_ONED; + break; + } + case powf_zero_to_zero: + /* powf 0**0 */ + { + DOMAINF; NAMEF = (char *) "powf"; + ifSVID + { + RETVAL_ZEROF; + NOT_MATHERRF + { + WRITEF_POW_ZERO_TO_ZERO; + ERRNO_RANGE; + } + *(float *)retval = excf.retval; + } + else RETVAL_ONEF; + break; + } + case powl_overflow: + /* powl(x,y) overflow */ + { + OVERFLOWL; NAMEL = (char *) "powl"; + ifSVID + { + if (INPUT_XL < 0) RETVAL_NEG_HUGEL; + else RETVAL_HUGEL; + } + else + { + if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL; + else RETVAL_HUGE_VALL; + } + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; + break; + } + case pow_overflow: + /* pow(x,y) overflow */ + { + OVERFLOWD; NAMED = (char *) "pow"; + ifSVID + { + if (INPUT_XD < 0) RETVAL_NEG_HUGED; + else RETVAL_HUGED; + } + else + { + if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD; + else RETVAL_HUGE_VALD; + } + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; + break; + } + case powf_overflow: + /* powf(x,y) overflow */ + { + OVERFLOWF; NAMEF = (char *) "powf"; + ifSVID + { + if (INPUT_XF < 0) RETVAL_NEG_HUGEF; + else RETVAL_HUGEF; + } + else + { + if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF; + else RETVAL_HUGE_VALF; + } + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case powl_underflow: + /* powl(x,y) underflow */ + { + 
UNDERFLOWL; NAMEL = (char *) "powl"; RETVAL_ZEROL; + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; + break; + } + case pow_underflow: + /* pow(x,y) underflow */ + { + UNDERFLOWD; NAMED = (char *) "pow"; RETVAL_ZEROD; + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; + break; + } + case powf_underflow: + /* powf(x,y) underflow */ + { + UNDERFLOWF; NAMEF = (char *) "powf"; RETVAL_ZEROF; + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case powl_zero_to_negative: + /* 0 to neg */ + { + DOMAINL; NAMEL = (char *) "powl"; + ifSVID + { + RETVAL_ZEROL; + NOT_MATHERRL + { + WRITEL_POW_ZERO_TO_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case pow_zero_to_negative: + /* 0**neg */ + { + DOMAIND; NAMED = (char *) "pow"; + ifSVID + { + RETVAL_ZEROD; + NOT_MATHERRD + { + WRITED_POW_ZERO_TO_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case powf_zero_to_negative: + /* 0**neg */ + { + DOMAINF; NAMEF = (char *) "powf"; + RETVAL_NEG_HUGE_VALF; + ifSVID + { + RETVAL_ZEROF; + NOT_MATHERRF + { + WRITEF_POW_ZERO_TO_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case powl_neg_to_non_integer: + /* neg**non_integral */ + { + DOMAINL; NAMEL = (char *) "powl"; + ifSVID + { + RETVAL_ZEROF; + NOT_MATHERRL + { + WRITEL_POW_NEG_TO_NON_INTEGER; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case pow_neg_to_non_integer: + /* neg**non_integral */ + { + DOMAIND; NAMED = (char *) "pow"; + ifSVID + { + RETVAL_ZEROD; + NOT_MATHERRD + { + WRITED_POW_NEG_TO_NON_INTEGER; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRD {ERRNO_DOMAIN;} + } + 
*(double *)retval = exc.retval; + break; + } + case powf_neg_to_non_integer: + /* neg**non-integral */ + { + DOMAINF; NAMEF = (char *) "powf"; + ifSVID + { + RETVAL_ZEROF; + NOT_MATHERRF + { + WRITEF_POW_NEG_TO_NON_INTEGER; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case powl_nan_to_zero: + /* pow(NaN,0.0) */ + /* Special Error */ + { + DOMAINL; NAMEL = (char *) "powl"; INPUT_XL; INPUT_YL; + excl.retval = *(long double *)arg1; + NOT_MATHERRL {ERRNO_DOMAIN;} + *(long double *)retval = excl.retval; + break; + } + case pow_nan_to_zero: + /* pow(NaN,0.0) */ + /* Special Error */ + { + DOMAIND; NAMED = (char *) "pow"; INPUT_XD; INPUT_YD; + exc.retval = *(double *)arg1; + NOT_MATHERRD {ERRNO_DOMAIN;} + *(double *)retval = exc.retval; + break; + } + case powf_nan_to_zero: + /* powf(NaN,0.0) */ + /* Special Error */ + { + DOMAINF; NAMEF = (char *) "powf"; INPUT_XF; INPUT_YF; + excf.retval = *(float *)arg1; + NOT_MATHERRF {ERRNO_DOMAIN;} + *(float *)retval = excf.retval; + break; + } + case atan2l_zero: + /* atan2l(0.0,0.0) */ + { + DOMAINL; NAMEL = (char *) "atan2l"; + RETVAL_ZEROL; + NOT_MATHERRL + { + ifSVID + { + WRITEL_ATAN2_ZERO_BY_ZERO; + } + ERRNO_DOMAIN; + } + *(long double *)retval = excl.retval; + break; + } + case atan2_zero: + /* atan2(0.0,0.0) */ + { + DOMAIND; NAMED = (char *) "atan2"; + RETVAL_ZEROD; + NOT_MATHERRD + { + ifSVID + { + WRITED_ATAN2_ZERO_BY_ZERO; + } + ERRNO_DOMAIN; + } + *(double *)retval = exc.retval; + break; + } + case atan2f_zero: + /* atan2f(0.0,0.0) */ + { + DOMAINF; NAMEF = (char *) "atan2f"; + RETVAL_ZEROF; + NOT_MATHERRF + ifSVID + { + WRITEF_ATAN2_ZERO_BY_ZERO; + } + ERRNO_DOMAIN; + *(float *)retval = excf.retval; + break; + } + case expm1_overflow: + /* expm1(finite) overflow */ + /* Overflow is the only documented */ + /* special value. 
*/ + { + ERRNO_RANGE; + break; + } + case expm1f_overflow: + /* expm1f(finite) overflow */ + { + ERRNO_RANGE; + break; + } + case expm1_underflow: + /* expm1(finite) underflow */ + /* Underflow is not documented */ + /* special value. */ + { + ERRNO_RANGE; + break; + } + case expm1f_underflow: + /* expm1f(finite) underflow */ + { + ERRNO_RANGE; + break; + } + case scalbl_underflow: + /* scalbl underflow */ + { + UNDERFLOWL; NAMEL = (char *) "scalbl"; + if (INPUT_XL < 0.0L) RETVAL_NEG_ZEROL; + else RETVAL_ZEROL; + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excf.retval; + break; + } + case scalb_underflow: + /* scalb underflow */ + { + UNDERFLOWD; NAMED = (char *) "scalb"; + if (INPUT_XD < 0.0) RETVAL_NEG_ZEROD; + else RETVAL_ZEROD; + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; + break; + } + case scalbf_underflow: + /* scalbf underflow */ + { + UNDERFLOWF; NAMEF = (char *) "scalbf"; + if (INPUT_XF < 0.0) RETVAL_NEG_ZEROF; + else RETVAL_ZEROF; + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case scalbl_overflow: + /* scalbl overflow */ + { + OVERFLOWL; NAMEL = (char *) "scalbl"; + if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL; + else RETVAL_HUGE_VALL; + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; + break; + } + case scalb_overflow: + /* scalb overflow */ + { + OVERFLOWD; NAMED = (char *) "scalb"; + if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD; + else RETVAL_HUGE_VALD; + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; + break; + } + case scalbf_overflow: + /* scalbf overflow */ + { + OVERFLOWF; NAMEF = (char *) "scalbf"; + if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF; + else RETVAL_HUGE_VALF; + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case hypotl_overflow: + /* hypotl overflow */ + { + OVERFLOWL; NAMEL = (char *) "hypotl"; + ifSVID + { + RETVAL_HUGEL; + } + else + { + RETVAL_HUGE_VALL; + } + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = 
excl.retval; + break; + } + case hypot_overflow: + /* hypot overflow */ + { + OVERFLOWD; NAMED = (char *) "hypot"; + ifSVID + { + RETVAL_HUGED; + } + else + { + RETVAL_HUGE_VALD; + } + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; + break; + } + case hypotf_overflow: + /* hypotf overflow */ + { + OVERFLOWF; NAMEF = (char *) "hypotf"; + ifSVID + { + RETVAL_HUGEF; + } + else + { + RETVAL_HUGE_VALF; + } + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case acosl_gt_one: + /* acosl(x > 1) */ + { + DOMAINL; NAMEL = (char *) "acosl"; + RETVAL_ZEROL; + ifSVID + { + NOT_MATHERRL + { + WRITEL_ACOS; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case acos_gt_one: + /* acos(x > 1) */ + { + DOMAIND; NAMED = (char *) "acos"; + RETVAL_ZEROD; + ifSVID + { + NOT_MATHERRD + { + WRITED_ACOS; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case acosf_gt_one: + /* acosf(x > 1) */ + { + DOMAINF; NAMEF = (char *) "acosf"; + RETVAL_ZEROF; + ifSVID + { + NOT_MATHERRF + { + WRITEF_ACOS; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case asinl_gt_one: + /* asinl(x > 1) */ + { + DOMAINL; NAMEL = (char *) "asinl"; + RETVAL_ZEROL; + ifSVID + { + NOT_MATHERRL + { + WRITEL_ASIN; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case asin_gt_one: + /* asin(x > 1) */ + { + DOMAIND; NAMED = (char *) "asin"; + RETVAL_ZEROD; + ifSVID + { + NOT_MATHERRD + { + WRITED_ASIN; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case asinf_gt_one: + /* asinf(x > 1) */ + { + DOMAINF; NAMEF = (char *) "asinf"; + RETVAL_ZEROF; + ifSVID + { + NOT_MATHERRF + { + WRITEF_ASIN; + ERRNO_DOMAIN; + } + } + else + { 
+ NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case coshl_overflow: + /* coshl overflow */ + { + OVERFLOWL; NAMEL = (char *) "coshl"; + ifSVID + { + RETVAL_HUGEL; + } + else + { + RETVAL_HUGE_VALL; + } + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; + break; + } + case cosh_overflow: + /* cosh overflow */ + { + OVERFLOWD; NAMED = (char *) "cosh"; + ifSVID + { + RETVAL_HUGED; + } + else + { + RETVAL_HUGE_VALD; + } + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; + break; + } + case coshf_overflow: + /* coshf overflow */ + { + OVERFLOWF; NAMEF = (char *) "coshf"; + ifSVID + { + RETVAL_HUGEF; + } + else + { + RETVAL_HUGE_VALF; + } + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case sinhl_overflow: + /* sinhl overflow */ + { + OVERFLOWL; NAMEL = (char *) "sinhl"; + ifSVID + { + if (INPUT_XL > 0.0) RETVAL_HUGEL; + else RETVAL_NEG_HUGEL; + } + else + { + if (INPUT_XL > 0.0) RETVAL_HUGE_VALL; + else RETVAL_NEG_HUGE_VALL; + } + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; + break; + } + case sinh_overflow: + /* sinh overflow */ + { + OVERFLOWD; NAMED = (char *) "sinh"; + ifSVID + { + if (INPUT_XD > 0.0) RETVAL_HUGED; + else RETVAL_NEG_HUGED; + } + else + { + if (INPUT_XD > 0.0) RETVAL_HUGE_VALD; + else RETVAL_NEG_HUGE_VALD; + } + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; + break; + } + case sinhf_overflow: + /* sinhf overflow */ + { + OVERFLOWF; NAMEF = (char *) "sinhf"; + ifSVID + { + if( INPUT_XF > 0.0) RETVAL_HUGEF; + else RETVAL_NEG_HUGEF; + } + else + { + if (INPUT_XF > 0.0) RETVAL_HUGE_VALF; + else RETVAL_NEG_HUGE_VALF; + } + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case acoshl_lt_one: + /* acoshl(x < 1) */ + { + DOMAINL; NAMEL = (char *) "acoshl"; + ifSVID + { + NOT_MATHERRL + { + WRITEL_ACOSH; + ERRNO_DOMAIN; + } + } + else NOT_MATHERRL {ERRNO_DOMAIN;} + *(long double *)retval = 
excl.retval; + break; + } + case acosh_lt_one: + /* acosh(x < 1) */ + { + DOMAIND; NAMED = (char *) "acosh"; + ifSVID + { + NOT_MATHERRD + { + WRITEL_ACOSH; + ERRNO_DOMAIN; + } + } + else NOT_MATHERRD {ERRNO_DOMAIN;} + *(double *)retval = exc.retval; + break; + } + case acoshf_lt_one: + /* acoshf(x < 1) */ + { + DOMAINF; NAMEF = (char *) "acoshf"; + ifSVID + { + NOT_MATHERRF + { + WRITEF_ACOSH; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + ERRNO_DOMAIN; break; + } + case atanhl_gt_one: + /* atanhl(|x| > 1) */ + { + DOMAINL; NAMEL = (char *) "atanhl"; + ifSVID + { + NOT_MATHERRL + { + WRITEL_ATANH_GT_ONE; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRL {ERRNO_DOMAIN;} + } + break; + } + case atanh_gt_one: + /* atanh(|x| > 1) */ + { + DOMAIND; NAMED = (char *) "atanh"; + ifSVID + { + NOT_MATHERRD + { + WRITED_ATANH_GT_ONE; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRD {ERRNO_DOMAIN;} + } + break; + } + case atanhf_gt_one: + /* atanhf(|x| > 1) */ + { + DOMAINF; NAMEF = (char *) "atanhf"; + ifSVID + { + NOT_MATHERRF + { + WRITEF_ATANH_GT_ONE; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRF {ERRNO_DOMAIN;} + } + break; + } + case atanhl_eq_one: + /* atanhl(|x| == 1) */ + { + SINGL; NAMEL = (char *)"atanhl"; + ifSVID + { + NOT_MATHERRL + { + WRITEL_ATANH_EQ_ONE; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRL {ERRNO_DOMAIN;} + } + break; + } + case atanh_eq_one: + /* atanh(|x| == 1) */ + { + SINGD; NAMED = (char *) "atanh"; + ifSVID + { + NOT_MATHERRD + { + WRITED_ATANH_EQ_ONE; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRD {ERRNO_DOMAIN;} + } + break; + } + case atanhf_eq_one: + /* atanhf(|x| == 1) */ + { + SINGF; NAMEF = (char *) "atanhf"; + ifSVID + { + NOT_MATHERRF + { + WRITEF_ATANH_EQ_ONE; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRF {ERRNO_DOMAIN;} + } + break; + } + case gammal_overflow: + /* gammal overflow */ + { + OVERFLOWL; NAMEL = (char *) "gammal"; + ifSVID + { + RETVAL_HUGEL; + } 
+ else + { + RETVAL_HUGE_VALL; + } + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; + break; + } + case gamma_overflow: + /* gamma overflow */ + { + OVERFLOWD; NAMED = (char *) "gamma"; + ifSVID + { + RETVAL_HUGED; + } + else + { + RETVAL_HUGE_VALD; + } + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; + break; + } + case gammaf_overflow: + /* gammaf overflow */ + { + OVERFLOWF; NAMEF = (char *) "gammaf"; + ifSVID + { + RETVAL_HUGEF; + } + else + { + RETVAL_HUGE_VALF; + } + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case lgammal_overflow: + /* lgammal overflow */ + { + OVERFLOWL; NAMEL = (char *) "lgammal"; + ifSVID + { + RETVAL_HUGEL; + } + else + { + RETVAL_HUGE_VALL; + } + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; + break; + } + case lgamma_overflow: + /* lgamma overflow */ + { + OVERFLOWD; NAMED = (char *) "lgamma"; + ifSVID + { + RETVAL_HUGED; + } + else + { + RETVAL_HUGE_VALD; + } + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; + break; + } + case lgammaf_overflow: + /* lgammaf overflow */ + { + OVERFLOWF; NAMEF = (char *) "lgammaf"; + ifSVID + { + RETVAL_HUGEF; + } + else + { + RETVAL_HUGE_VALF; + } + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; + break; + } + case lgammal_negative: + /* lgammal -int or 0 */ + { + SINGL; NAMEL = (char *) "lgammal"; + ifSVID + { + RETVAL_HUGEL; + NOT_MATHERRL + { + WRITEL_LGAMMA_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case lgamma_negative: + /* lgamma -int or 0 */ + { + SINGD; NAMED = (char *) "lgamma"; + ifSVID + { + RETVAL_HUGED; + NOT_MATHERRD + { + WRITED_LGAMMA_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case lgammaf_negative: + /* lgammaf -int or 0 */ + { + SINGF; NAMEF = (char 
*) "lgammaf"; + ifSVID + { + RETVAL_HUGEF; + NOT_MATHERRF + { + WRITEF_LGAMMA_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case gammal_negative: + /* gammal -int or 0 */ + { + SINGL; NAMEL = (char *) "gammal"; + ifSVID + { + RETVAL_HUGEL; + NOT_MATHERRL + { + WRITEL_GAMMA_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case gamma_negative: + /* gamma -int or 0 */ + { + SINGD; NAMED = (char *) "gamma"; + ifSVID + { + RETVAL_HUGED; + NOT_MATHERRD + { + WRITED_GAMMA_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case gammaf_negative: + /* gammaf -int or 0 */ + { + SINGF; NAMEF = (char *) "gammaf"; + ifSVID + { + RETVAL_HUGEF; + NOT_MATHERRF + { + WRITEF_GAMMA_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case j0l_gt_loss: + /* j0l > loss */ + { + TLOSSL; NAMEL = (char *) "j0l"; + RETVAL_ZEROL; + ifSVID + { + NOT_MATHERRL + { + WRITEL_J0_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRL {ERRNO_RANGE;} + } + *(long double *)retval = excl.retval; + break; + } + case j0_gt_loss: + /* j0 > loss */ + { + TLOSSD; NAMED = (char *) "j0"; + RETVAL_ZEROD; + ifSVID + { + NOT_MATHERRD + { + WRITED_J0_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRD {ERRNO_RANGE;} + } + *(double*)retval = exc.retval; + break; + } + case j0f_gt_loss: + /* j0f > loss */ + { + TLOSSF; NAMEF = (char *) "j0f"; + RETVAL_ZEROF; + ifSVID + { + NOT_MATHERRF + { + WRITEF_J0_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRF {ERRNO_RANGE;} + } + *(float*)retval = excf.retval; + break; + } + case j1l_gt_loss: + /* j1l > loss */ + { + TLOSSL; NAMEL = (char *) "j1l"; + RETVAL_ZEROL; + 
ifSVID + { + NOT_MATHERRL + { + WRITEL_J1_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRL {ERRNO_RANGE;} + } + *(long double *)retval = excl.retval; + break; + } + case j1_gt_loss: + /* j1 > loss */ + { + TLOSSD; NAMED = (char *) "j1"; + RETVAL_ZEROD; + ifSVID + { + NOT_MATHERRD + { + WRITED_J1_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRD {ERRNO_RANGE;} + } + *(double*)retval = exc.retval; + break; + } + case j1f_gt_loss: + /* j1f > loss */ + { + TLOSSF; NAMEF = (char *) "j1f"; + RETVAL_ZEROF; + ifSVID + { + NOT_MATHERRF + { + WRITEF_J1_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRF {ERRNO_RANGE;} + } + *(float*)retval = excf.retval; + break; + } + case jnl_gt_loss: + /* jnl > loss */ + { + TLOSSL; NAMEL = (char *) "jnl"; + RETVAL_ZEROL; + ifSVID + { + NOT_MATHERRL + { + WRITEL_JN_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRL {ERRNO_RANGE;} + } + *(long double *)retval = excl.retval; + break; + } + case jn_gt_loss: + /* jn > loss */ + { + TLOSSD; NAMED = (char *) "jn"; + RETVAL_ZEROD; + ifSVID + { + NOT_MATHERRD + { + WRITED_JN_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRD {ERRNO_RANGE;} + } + *(double*)retval = exc.retval; + break; + } + case jnf_gt_loss: + /* jnf > loss */ + { + TLOSSF; NAMEF = (char *) "jnf"; + RETVAL_ZEROF; + ifSVID + { + NOT_MATHERRF + { + WRITEF_JN_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRF {ERRNO_RANGE;} + } + *(float*)retval = excf.retval; + break; + } + case y0l_gt_loss: + /* y0l > loss */ + { + TLOSSL; NAMEL = (char *) "y0l"; + RETVAL_ZEROL; + ifSVID + { + NOT_MATHERRL + { + WRITEL_Y0_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRL {ERRNO_RANGE;} + } + *(long double *)retval = excl.retval; + break; + } + case y0_gt_loss: + /* y0 > loss */ + { + TLOSSD; NAMED = (char *) "y0"; + RETVAL_ZEROD; + ifSVID + { + NOT_MATHERRD + { + WRITED_Y0_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRD {ERRNO_RANGE;} + } + *(double*)retval = exc.retval; + break; + } + case 
y0f_gt_loss: + /* y0f > loss */ + { + TLOSSF; NAMEF = (char *) "y0f"; + RETVAL_ZEROF; + ifSVID + { + NOT_MATHERRF + { + WRITEF_Y0_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRF {ERRNO_RANGE;} + } + *(float*)retval = excf.retval; + break; + } + case y0l_zero: + /* y0l(0) */ + { + DOMAINL; NAMEL = (char *) "y0l"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_Y0_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case y0_zero: + /* y0(0) */ + { + DOMAIND; NAMED = (char *) "y0"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_Y0_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case y0f_zero: + /* y0f(0) */ + { + DOMAINF; NAMEF = (char *) "y0f"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_Y0_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case y1l_gt_loss: + /* y1l > loss */ + { + TLOSSL; NAMEL = (char *) "y1l"; + RETVAL_ZEROL; + ifSVID + { + NOT_MATHERRL + { + WRITEL_Y1_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRL {ERRNO_RANGE;} + } + *(long double *)retval = excl.retval; + break; + } + case y1_gt_loss: + /* y1 > loss */ + { + TLOSSD; NAMED = (char *) "y1"; + RETVAL_ZEROD; + ifSVID + { + NOT_MATHERRD + { + WRITED_Y1_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRD {ERRNO_RANGE;} + } + *(double*)retval = exc.retval; + break; + } + case y1f_gt_loss: + /* y1f > loss */ + { + TLOSSF; NAMEF = (char *) "y1f"; + RETVAL_ZEROF; + ifSVID + { + NOT_MATHERRF + { + WRITEF_Y1_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRF {ERRNO_RANGE;} + } + *(float*)retval = excf.retval; + break; + } + case y1l_zero: + /* y1l(0) */ + { + DOMAINL; NAMEL = (char *) "y1l"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + 
{ + WRITEL_Y1_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case y1_zero: + /* y1(0) */ + { + DOMAIND; NAMED = (char *) "y1"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_Y1_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case y1f_zero: + /* y1f(0) */ + { + DOMAINF; NAMEF = (char *) "y1f"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_Y1_ZERO; + ERRNO_DOMAIN; + } + }else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case ynl_gt_loss: + /* ynl > loss */ + { + TLOSSL; NAMEL = (char *) "ynl"; + RETVAL_ZEROL; + ifSVID + { + NOT_MATHERRL + { + WRITEL_YN_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRL {ERRNO_RANGE;} + } + *(long double *)retval = excl.retval; + break; + } + case yn_gt_loss: + /* yn > loss */ + { + TLOSSD; NAMED = (char *) "yn"; + RETVAL_ZEROD; + ifSVID + { + NOT_MATHERRD + { + WRITED_YN_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRD {ERRNO_RANGE;} + } + *(double*)retval = exc.retval; + break; + } + case ynf_gt_loss: + /* ynf > loss */ + { + TLOSSF; NAMEF = (char *) "ynf"; + RETVAL_ZEROF; + ifSVID + { + NOT_MATHERRF + { + WRITEF_YN_TLOSS; + ERRNO_RANGE; + } + } + else + { + NOT_MATHERRF {ERRNO_RANGE;} + } + *(float*)retval = excf.retval; + break; + } + case ynl_zero: + /* ynl(0) */ + { + DOMAINL; NAMEL = (char *) "ynl"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_YN_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case yn_zero: + /* yn(0) */ + { + DOMAIND; NAMED = (char *) "yn"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_YN_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + 
NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case ynf_zero: + /* ynf(0) */ + { + DOMAINF; NAMEF = (char *) "ynf"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_YN_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case y0l_negative: + /* y0l(x<0) */ + { + DOMAINL; NAMEL = (char *) "y0l"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_Y0_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case y0_negative: + /* y0(x<0) */ + { + DOMAIND; NAMED = (char *) "y0"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_Y0_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case y0f_negative: + /* y0f(x<0) */ + { + DOMAINF; NAMEF = (char *) "y0f"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_Y0_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case y1l_negative: + /* y1l(x<0) */ + { + DOMAINL; NAMEL = (char *) "y1l"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_Y1_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case y1_negative: + /* y1(x<0) */ + { + DOMAIND; NAMED = (char *) "y1"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_Y1_NEGATIUE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case y1f_negative: + /* y1f(x<0) */ + { + DOMAINF; NAMEF = (char *) "y1f"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_Y1_NEGATIVE; + ERRNO_DOMAIN; + } 
+ } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case ynl_negative: + /* ynl(x<0) */ + { + DOMAINL; NAMEL = (char *) "ynl"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_YN_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case yn_negative: + /* yn(x<0) */ + { + DOMAIND; NAMED = (char *) "yn"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_YN_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case ynf_negative: + /* ynf(x<0) */ + { + DOMAINF; NAMEF = (char *) "ynf"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_YN_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case fmodl_by_zero: + /* fmodl(x,0) */ + { + DOMAINL; NAMEL = (char *) "fmodl"; + ifSVID + { + *(long double *)retval = *(long double *)arg1; + NOT_MATHERRL + { + WRITEL_FMOD; + ERRNO_DOMAIN; + } + } + else + { /* NaN already computed */ + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case fmod_by_zero: + /* fmod(x,0) */ + { + DOMAIND; NAMED = (char *) "fmod"; + ifSVID + { + *(double *)retval = *(double *)arg1; + NOT_MATHERRD + { + WRITED_FMOD; + ERRNO_DOMAIN; + } + } + else + { /* NaN already computed */ + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case fmodf_by_zero: + /* fmodf(x,0) */ + { + DOMAINF; NAMEF = (char *) "fmodf"; + ifSVID + { + *(float *)retval = *(float *)arg1; + NOT_MATHERRF + { + WRITEF_FMOD; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case remainderl_by_zero: + /* remainderl(x,0) */ + { + DOMAINL; NAMEL = (char 
*) "remainderl"; + ifSVID + { + NOT_MATHERRL + { + WRITEL_REM; + ERRNO_DOMAIN; + } + } + else + { /* NaN already computed */ + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case remainder_by_zero: + /* remainder(x,0) */ + { + DOMAIND; NAMED = (char *) "remainder"; + ifSVID + { + NOT_MATHERRD + { + WRITED_REM; + ERRNO_DOMAIN; + } + } + else + { /* NaN already computed */ + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case remainderf_by_zero: + /* remainderf(x,0) */ + { + DOMAINF; NAMEF = (char *) "remainderf"; + ifSVID + { + NOT_MATHERRF + { + WRITEF_REM; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + default: + abort(); + } + return; + } +} diff --git a/sysdeps/ia64/fpu/libm_frexp4.S b/sysdeps/ia64/fpu/libm_frexp4.S new file mode 100644 index 0000000000..f20a00b154 --- /dev/null +++ b/sysdeps/ia64/fpu/libm_frexp4.S @@ -0,0 +1,185 @@ +.file "libm_frexp_4.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 3/20/00: Improved speed +// 6/01/00: Fixed bug when x a double-extended denormal +// 12/08/00 Corrected label on .endp +// +// API +//============================================================== +// double frexp(double x, int* y) +// double __libm_frexp_4(double x, int* y) +// where int* y is a 32-bit integer +// +// Overview of operation +//============================================================== +// break a floating point number x into a fraction and an exponent +// The fraction is returned as a double +// The exponent is returned as an integer pointed to by y +// This is a true exponent (not a biased one), but 0xfffe is subtracted +// as a bias instead of 0xffff. This is because the fraction returned +// is between 0.5 and 1.0, not the expected IEEE range. 
+// +// The fraction is 0.5 <= fraction < 1.0 +// +// Registers used +//============================================================== +// +// general registers: +// r14 signexp for significand result for x negative (sign set, exponent 0xfffe) +// r15 signexp for significand result for x positive; also used as the exponent bias +// r16 signexp of x +// r17 exponent mask +// r18 exponent of x +// r19 exponent result +// r20 signexp of 2^64 +// r32 on input contains the 64-bit IEEE double that is in f8 +// r33 on input pointer to 32-bit integer for exponent +// +// predicate registers: +// p6 set if x is NaN, zero, or infinity +// p7 set if x negative +// p8 set if x positive +// p9 set if x double-extended denormal +// +// floating-point registers: +// f8 input, output +// f9 normalized x +// f10 signexp for significand result for x positive +// f11 signexp for significand result for x negative +// f12 2^64 + +#include "libm_support.h" + +.align 32 +.global __libm_frexp_4# + +.section .text +.proc __libm_frexp_4# +.align 32 + +__libm_frexp_4: + +// Set signexp for significand result for x>0 +// If x is a NaN, zero, or infinity, return it. +// Put 0 in the int pointer. +// x NAN, ZERO, INFINITY? 
+// Set signexp for significand result for x<0 +{ .mfi +(p0) mov r15 = 0x0fffe +(p0) fclass.m.unc p6,p0 = f8, 0xe7 +(p0) mov r14 = 0x2fffe +} +// Form signexp of 2^64 in case x double-extended denormal +// Save the normalized value of input in f9 +// The normalization also sets fault flags and takes faults if necessary +{ .mfi +(p0) mov r20 = 0x1003f +(p0) fnorm f9 = f8 + nop.i 999 ;; +} + +// Move signexp for significand result for x>0 to FP reg +// Form 2^64 in case x double-extended denormal +{ .mmi +(p0) setf.exp f10 = r15 +(p0) setf.exp f12 = r20 + nop.i 999 ;; +} + +// Move signexp for significand result for x<0 to FP reg +// If x NAN, ZERO, INFINITY, set *y=0 as a 32-bit integer, and exit +{ .mmb +(p0) setf.exp f11 = r14 +(p6) st4 [r33] = r0 +(p6) br.ret.spnt b0 ;; +} + +// Form exponent mask +// p7 if x<0, else p8 +{ .mfi +(p0) mov r17 = 0x1ffff +(p0) fcmp.lt.unc p7,p8 = f8,f0 + nop.i 999 ;; +} + +// Test for fnorm(x) denormal, means x double-extended denormal +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f9, 0x0b + nop.i 999 ;; +} + +// If x double-extended denormal add 64 to exponent bias for scaling +// If x double-extended denormal multiply x * 2^64 which is normal +{ .mfi +(p9) add r15 = 64, r15 +(p9) fmpy f9 = f9, f12 + nop.i 999 ;; +} + +// true exponent stored to int pointer +// the bias is treated as 0xfffe instead of +// normal 0xffff because we want the significand +// to be in the range 0.5 <= sig < 1.0 +// Store the value of the exponent at the pointer in r33 + +// If x>0 form significand result +{ .mfi + nop.m 999 +(p8) fmerge.se f8 = f10,f9 + nop.i 999 ;; +} + +// Get signexp of normalized x +// If x<0 form significand result +{ .mfi +(p0) getf.exp r16 = f9 +(p7) fmerge.se f8 = f11,f9 + nop.i 999 ;; +} + +// Get exp of normalized x +// Subtract off bias to get true exponent of x +{ .mmi +(p0) and r18 = r17,r16 ;; +(p0) sub r19 = r18,r15 + nop.i 999 ;; +} + +// Store int y as a 32-bit integer +// Make the value a double +{ .mfb +(p0) st4 [r33] 
= r19 +(p0) fnorm.d f8 = f8 +(p0) br.ret.sptk b0 ;; +} + +.endp __libm_frexp_4 +ASM_SIZE_DIRECTIVE(__libm_frexp_4) diff --git a/sysdeps/ia64/fpu/libm_frexp4f.S b/sysdeps/ia64/fpu/libm_frexp4f.S new file mode 100644 index 0000000000..d94ad095a0 --- /dev/null +++ b/sysdeps/ia64/fpu/libm_frexp4f.S @@ -0,0 +1,185 @@ +.file "libm_frexp_4f.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// History +//============================================================== +// 2/02/00: Initial version +// 3/20/00: Improved speed +// 6/01/00: Fixed bug when x a double-extended denormal +// 12/08/00 Corrected label on .endp +// +// API +//============================================================== +// float frexpf(float x, int* y) +// float __libm_frexp_4f(float x, int* y) +// where int* y is a 32-bit integer +// +// Overview of operation +//============================================================== +// break a floating point number x into a fraction and an exponent +// The fraction is returned as a float +// The exponent is returned as an integer pointed to by y +// This is a true exponent (not a biased one), but 0xfffe is subtracted +// as a bias instead of 0xffff. This is because the fraction returned +// is between 0.5 and 1.0, not the expected IEEE range. +// +// The fraction is 0.5 <= fraction < 1.0 +// +// Registers used +//============================================================== + +// general registers: +// r14 signexp for significand result for x negative (sign set, exponent 0xfffe) +// r15 signexp for significand result for x positive; also used as the exponent bias +// r16 signexp of x +// r17 exponent mask +// r18 exponent of x +// r19 exponent result +// r20 signexp of 2^64 +// r32 on input contains the 32-bit IEEE float that is in f8 +// r33 on input pointer to 32-bit integer for exponent + +// predicate registers: +// p6 set if x is NaN, zero, or infinity +// p7 set if x negative +// p8 set if x positive +// p9 set if x double-extended denormal + +// floating-point registers: +// f8 input, output +// f9 normalized x +// f10 signexp for significand result for x positive +// f11 signexp for significand result for x negative +// f12 2^64 + +#include "libm_support.h" + +.align 32 +.global __libm_frexp_4f# + +.section .text +.proc __libm_frexp_4f# +.align 32 + +__libm_frexp_4f: + +// Set signexp for significand result for x>0 +// If x is a NaN, zero, or infinity, return it. +// Put 0 in the int pointer. +// x NAN, ZERO, INFINITY? 
+// Set signexp for significand result for x<0 +{ .mfi +(p0) mov r15 = 0x0fffe +(p0) fclass.m.unc p6,p0 = f8, 0xe7 +(p0) mov r14 = 0x2fffe +} +// Form signexp of 2^64 in case x double-extended denormal +// Save the normalized value of input in f9 +// The normalization also sets fault flags and takes faults if necessary +{ .mfi +(p0) mov r20 = 0x1003f +(p0) fnorm f9 = f8 + nop.i 999 ;; +} + +// Move signexp for significand result for x>0 to FP reg +// Form 2^64 in case x double-extended denormal +{ .mmi +(p0) setf.exp f10 = r15 +(p0) setf.exp f12 = r20 + nop.i 999 ;; +} + +// Move signexp for significand result for x<0 to FP reg +// If x NAN, ZERO, INFINITY, set *y=0 as a 32-bit integer, and exit +{ .mmb +(p0) setf.exp f11 = r14 +(p6) st4 [r33] = r0 +(p6) br.ret.spnt b0 ;; +} + +// Form exponent mask +// p7 if x<0, else p8 +{ .mfi +(p0) mov r17 = 0x1ffff +(p0) fcmp.lt.unc p7,p8 = f8,f0 + nop.i 999 ;; +} + +// Test for fnorm(x) denormal, means x double-extended denormal +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f9, 0x0b + nop.i 999 ;; +} + +// If x double-extended denormal add 64 to exponent bias for scaling +// If x double-extended denormal multiply x * 2^64 which is normal +{ .mfi +(p9) add r15 = 64, r15 +(p9) fmpy f9 = f9, f12 + nop.i 999 ;; +} + +// true exponent stored to int pointer +// the bias is treated as 0xfffe instead of +// normal 0xffff because we want the significand +// to be in the range 0.5 <= sig < 1.0 +// Store the value of the exponent at the pointer in r33 + +// If x>0 form significand result +{ .mfi + nop.m 999 +(p8) fmerge.se f8 = f10,f9 + nop.i 999 ;; +} + +// Get signexp of normalized x +// If x<0 form significand result +{ .mfi +(p0) getf.exp r16 = f9 +(p7) fmerge.se f8 = f11,f9 + nop.i 999 ;; +} + +// Get exp of normalized x +// Subtract off bias to get true exponent of x +{ .mmi +(p0) and r18 = r17,r16 ;; +(p0) sub r19 = r18,r15 + nop.i 999 ;; +} + +// Store int y as a 32-bit integer +// Make the value a float +{ .mfb +(p0) st4 [r33] 
= r19 +(p0) fnorm.s f8 = f8 +(p0) br.ret.sptk b0 ;; +} + +.endp __libm_frexp_4f +ASM_SIZE_DIRECTIVE(__libm_frexp_4f) diff --git a/sysdeps/ia64/fpu/libm_frexp4l.S b/sysdeps/ia64/fpu/libm_frexp4l.S new file mode 100644 index 0000000000..4dfd223704 --- /dev/null +++ b/sysdeps/ia64/fpu/libm_frexp4l.S @@ -0,0 +1,184 @@ +.file "libm_frexp_4l.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// History +//============================================================== +// 3/20/00: Initial version +// 6/01/00: Fixed bug when x a double-extended denormal +// 12/08/00 Corrected label on .endp +// +// API +//============================================================== +// long double frexpl(long double x, int* y) +// long double __libm_frexp_4l(long double x, int* y) +// where int* y is a 32-bit integer +// +// Overview of operation +//============================================================== +// break a floating point number x into a fraction and an exponent +// The fraction is returned as a long double +// The exponent is returned as an integer pointed to by y +// This is a true exponent (not a biased one), but 0xfffe is subtracted +// as a bias instead of 0xffff. This is because the fraction returned +// is between 0.5 and 1.0, not the expected IEEE range. +// +// The fraction is 0.5 <= fraction < 1.0 +// +// Registers used +//============================================================== +// +// general registers: +// r14 signexp for significand result for x negative (sign set, exponent 0xfffe) +// r15 signexp for significand result for x positive; also used as the exponent bias +// r16 signexp of x +// r17 exponent mask +// r18 exponent of x +// r19 exponent result +// r20 signexp of 2^64 +// r32-33 on input contains the 80-bit IEEE long double that is in f8 +// r34 on input pointer to 32-bit integer for exponent +// +// predicate registers: +// p6 set if x is NaN, zero, or infinity +// p7 set if x negative +// p8 set if x positive +// p9 set if x double-extended denormal +// +// floating-point registers: +// f8 input, output +// f9 normalized x +// f10 signexp for significand result for x positive +// f11 signexp for significand result for x negative +// f12 2^64 + +#include "libm_support.h" + +.align 32 +.global __libm_frexp_4l# + +.section .text +.proc __libm_frexp_4l# +.align 32 + +__libm_frexp_4l: + +// Set signexp for significand result for x>0 +// If x is a NaN, zero, or infinity, return it. +// Put 0 in the int pointer. 
+// x NAN, ZERO, INFINITY? +// Set signexp for significand result for x<0 +{ .mfi +(p0) mov r15 = 0x0fffe +(p0) fclass.m.unc p6,p0 = f8, 0xe7 +(p0) mov r14 = 0x2fffe +} +// Form signexp of 2^64 in case x double-extended denormal +// Save the normalized value of input in f9 +// The normalization also sets fault flags and takes faults if necessary +{ .mfi +(p0) mov r20 = 0x1003f +(p0) fnorm f9 = f8 + nop.i 999 ;; +} + +// Move signexp for significand result for x>0 to FP reg +// Form 2^64 in case x double-extended denormal +{ .mmi +(p0) setf.exp f10 = r15 +(p0) setf.exp f12 = r20 + nop.i 999 ;; +} + +// Move signexp for significand result for x<0 to FP reg +// If x NAN, ZERO, INFINITY, set *y=0 as a 32-bit integer, and exit +{ .mmb +(p0) setf.exp f11 = r14 +(p6) st4 [r34] = r0 +(p6) br.ret.spnt b0 ;; +} + +// Form exponent mask +// p7 if x<0, else p8 +{ .mfi +(p0) mov r17 = 0x1ffff +(p0) fcmp.lt.unc p7,p8 = f8,f0 + nop.i 999 ;; +} + +// Test for fnorm(x) denormal, means x double-extended denormal +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9,p0 = f9, 0x0b + nop.i 999 ;; +} + +// If x double-extended denormal add 64 to exponent bias for scaling +// If x double-extended denormal multiply x * 2^64 which is normal +{ .mfi +(p9) add r15 = 64, r15 +(p9) fmpy f9 = f9, f12 + nop.i 999 ;; +} + +// true exponent stored to int pointer +// the bias is treated as 0xfffe instead of +// normal 0xffff because we want the significand +// to be in the range 0.5 <= sig < 1.0 +// Store the value of the exponent at the pointer in r34 + +// If x>0 form significand result +{ .mfi + nop.m 999 +(p8) fmerge.se f8 = f10,f9 + nop.i 999 ;; +} + +// Get signexp of normalized x +// If x<0 form significand result +{ .mfi +(p0) getf.exp r16 = f9 +(p7) fmerge.se f8 = f11,f9 + nop.i 999 ;; +} + +// Get exp of normalized x +// Subtract off bias to get true exponent of x +{ .mmi +(p0) and r18 = r17,r16 ;; +(p0) sub r19 = r18,r15 + nop.i 999 ;; +} + +// Store int y as a 32-bit integer +// Make the value a 
long double +{ .mfb +(p0) st4 [r34] = r19 +(p0) fnorm f8 = f8 +(p0) br.ret.sptk b0 ;; +} + +.endp __libm_frexp_4l +ASM_SIZE_DIRECTIVE(__libm_frexp_4l) diff --git a/sysdeps/ia64/fpu/libm_reduce.S b/sysdeps/ia64/fpu/libm_reduce.S new file mode 100644 index 0000000000..fb04d36840 --- /dev/null +++ b/sysdeps/ia64/fpu/libm_reduce.S @@ -0,0 +1,1527 @@ +.file "libm_reduce.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History: 02/02/00 Initial Version +// +// ********************************************************************* +// ********************************************************************* +// +// Function: __libm_pi_by_two_reduce(x) return r, c, and N where +// x = N * pi/4 + (r+c) , where |r+c| <= pi/4. 
+// This function is not designed to be used by the +// general user. +// +// ********************************************************************* +// +// Accuracy: Returns double-precision values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f32-f70 +// +// General Purpose Registers: +// r8 = return value N +// r32 = Address of x +// r33 = Address of where to place r and then c +// r34-r64 +// +// Predicate Registers: p6-p14 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// No conditions should be raised. +// +// ********************************************************************* +// +// I. Introduction +// =============== +// +// For the forward trigonometric functions sin, cos, sincos, and +// tan, the original algorithms for IA 64 handle arguments up to +// 1 ulp less than 2^63 in magnitude. For double-extended arguments x, +// |x| >= 2^63, this routine returns CASE, N and r_hi, r_lo where +// +// x is accurately approximated by +// 2*K*pi + N * pi/2 + r_hi + r_lo, |r_hi+r_lo| <= pi/4. +// CASE = 1 or 2. +// CASE is 1 unless |r_hi + r_lo| < 2^(-33). +// +// The exact value of K is not determined, but that information is +// not required in trigonometric function computations. +// +// We first assume the argument x in question satisfies x >= 2^(63). +// In particular, it is positive. Negative x can be handled by symmetry: +// +// -x is accurately approximated by +// -2*K*pi + (-N) * pi/2 - (r_hi + r_lo), |r_hi+r_lo| <= pi/4. +// +// The idea of the reduction is that +// +// x * 2/pi = N_big + N + f, |f| <= 1/2 +// +// Moreover, for double extended x, |f| >= 2^(-75). (This is a +// non-obvious fact found by enumeration using a special algorithm +// involving continued fractions.) The algorithm described below +// calculates N and an accurate approximation of f. 
+// +// Roughly speaking, an appropriate 256-bit (4 X 64) portion of +// 2/pi is multiplied with x to give the desired information. +// +// II. Representation of 2/PI +// ========================== +// +// The value of 2/pi in binary fixed-point is +// +// .101000101111100110...... +// +// We store 2/pi in a table, starting at the position corresponding +// to bit position 63 +// +// bit position 63 62 ... 0 -1 -2 -3 -4 -5 -6 -7 .... -16576 +// +// 0 0 ... 0 . 1 0 1 0 0 0 1 .... X +// +// ^ +// |__ implied binary pt +// +// III. Algorithm +// ============== +// +// This describes the algorithm in the most natural way using +// unsigned integer multiplication. The implementation section +// describes how the integer arithmetic is simulated. +// +// STEP 0. Initialization +// ---------------------- +// +// Let the input argument x be +// +// x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ), 63 <= m <= 16383. +// +// The first crucial step is to fetch four 64-bit portions of 2/pi. +// To fulfill this goal, we calculate the bit position L of the +// beginning of this 256-bit quantity by +// +// L := 62 - m. +// +// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that +// the storage of 2/pi is adequate. +// +// Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus: +// +// bit position L L-1 L-2 ... L-63 +// +// P_1 = b b b ... b +// +// each b can be 0 or 1. Also, let P_0 be the two bits corresponding to +// bit positions L+2 and L+1. So, when each of the P_j is interpreted +// with appropriate scaling, we have +// +// 2/pi = P_big + P_0 + (P_1 + P_2 + P_3 + P_4) + P_small +// +// Note that P_big and P_small can be ignored. The reasons are as follows. +// First, consider P_big. If P_big = 0, we can certainly ignore it. +// Otherwise, P_big >= 2^(L+3). Now, +// +// P_big * ulp(x) >= 2^(L+3) * 2^(m-63) +// >= 2^(65-m + m-63 ) +// >= 2^2 +// +// Thus, P_big * x is an integer of the form 4*K. 
So +// +// x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2) +// + x*P_small*(pi/2). +// +// Hence, P_big*x corresponds to information that can be ignored for +// trigonometric function evaluation. +// +// Next, we must estimate the effect of ignoring P_small. The absolute +// error made by ignoring P_small is bounded by +// +// |P_small * x| <= ulp(P_4) * x +// <= 2^(L-255) * 2^(m+1) +// <= 2^(62-m-255 + m + 1) +// <= 2^(-192) +// +// Since for double-extended precision, x * 2/pi = integer + f, +// 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring +// P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable. +// +// Further note that if x is split into x_hi + x_lo where x_lo is the +// two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then +// +// P_0 * x_hi +// +// is also an integer of the form 4*K; and thus can also be ignored. +// Let M := P_0 * x_lo which is a small integer. The main part of the +// calculation is really the multiplication of x with the four pieces +// P_1, P_2, P_3, and P_4. +// +// Unless the reduced argument is extremely small in magnitude, it +// suffices to carry out the multiplication of x with P_1, P_2, and +// P_3. x*P_4 will be carried out and added on as a correction only +// when it is found to be needed. Note also that x*P_4 need not be +// computed exactly. A straightforward multiplication suffices since +// the rounding error thus produced would be bounded by 2^(-3*64), +// that is 2^(-192) which is small enough as the reduced argument +// is bounded from below by 2^(-75). +// +// Now we have four 64-bit data representing 2/pi and a +// 64-bit x. We first need to calculate a highly accurate product +// of x and P_1, P_2, P_3. This is best understood as integer +// multiplication. +// +// +// STEP 1. 
Multiplication +// ---------------------- +// +// +// --------- --------- --------- +// | P_1 | | P_2 | | P_3 | +// --------- --------- --------- +// +// --------- +// X | X | +// --------- +// ---------------------------------------------------- +// +// --------- --------- +// | A_hi | | A_lo | +// --------- --------- +// +// +// --------- --------- +// | B_hi | | B_lo | +// --------- --------- +// +// +// --------- --------- +// | C_hi | | C_lo | +// --------- --------- +// +// ==================================================== +// --------- --------- --------- --------- +// | S_0 | | S_1 | | S_2 | | S_3 | +// --------- --------- --------- --------- +// +// +// +// STEP 2. Get N and f +// ------------------- +// +// Conceptually, after the individual pieces S_0, S_1, ..., are obtained, +// we have to sum them and obtain an integer part, N, and a fraction, f. +// Here, |f| <= 1/2, and N is an integer. Note also that N need only +// be known modulo 2^k, k >= 2. In the case when |f| is small enough, +// we would need to add in the value x*P_4. +// +// +// STEP 3. Get reduced argument +// ---------------------------- +// +// The value f is not yet the reduced argument that we seek. The +// equation +// +// x * 2/pi = 4K + N + f +// +// says that +// +// x = 2*K*pi + N * pi/2 + f * (pi/2). +// +// Thus, the reduced argument is given by +// +// reduced argument = f * pi/2. +// +// This multiplication must be performed to extra precision. +// +// IV. Implementation +// ================== +// +// Step 0. Initialization +// ---------------------- +// +// Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x. +// +// In memory, 2/pi is stored contiguously as +// +// 0x00000000 0x00000000 0xA2F.... +// ^ +// |__ implied binary point +// +// Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus +// -16321 <= L <= -1. We fetch from memory 5 integer pieces of data. 
+// +// P_0 is the two bits corresponding to bit positions L+2 and L+1 +// P_1 is the 64-bit starting at bit position L +// P_2 is the 64-bit starting at bit position L-64 +// P_3 is the 64-bit starting at bit position L-128 +// P_4 is the 64-bit starting at bit position L-192 +// +// For example, if m = 63, P_0 would be 0 and P_1 would look like +// 0xA2F... +// +// If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary. +// P_1 in binary would be 1 0 0 0 1 0 1 1 1 1 .... +// +// Step 1. Multiplication +// ---------------------- +// +// At this point, P_1, P_2, P_3, P_4 are integers. They are +// supposed to be interpreted as +// +// 2^(L-63) * P_1; +// 2^(L-63-64) * P_2; +// 2^(L-63-128) * P_3; +// 2^(L-63-192) * P_4; +// +// Since each of them need to be multiplied to x, we would scale +// both x and the P_j's by some convenient factors: scale each +// of P_j's up by 2^(63-L), and scale x down by 2^(L-63). +// +// p_1 := fcvt.xf ( P_1 ) +// p_2 := fcvt.xf ( P_2 ) * 2^(-64) +// p_3 := fcvt.xf ( P_3 ) * 2^(-128) +// p_4 := fcvt.xf ( P_4 ) * 2^(-192) +// x := replace exponent of x by -1 +// because 2^m * 1.xxxx...xxx * 2^(L-63) +// is 2^(-1) * 1.xxxx...xxx +// +// We are now faced with the task of computing the following +// +// --------- --------- --------- +// | P_1 | | P_2 | | P_3 | +// --------- --------- --------- +// +// --------- +// X | X | +// --------- +// ---------------------------------------------------- +// +// --------- --------- +// | A_hi | | A_lo | +// --------- --------- +// +// --------- --------- +// | B_hi | | B_lo | +// --------- --------- +// +// --------- --------- +// | C_hi | | C_lo | +// --------- --------- +// +// ==================================================== +// ----------- --------- --------- --------- +// | S_0 | | S_1 | | S_2 | | S_3 | +// ----------- --------- --------- --------- +// ^ ^ +// | |___ binary point +// | +// |___ possibly one more bit +// +// Let FPSR3 be set to round towards zero with widest 
precision +// and exponent range. Unless an explicit FPSR is given, +// round-to-nearest with widest precision and exponent range is +// used. +// +// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65). +// +// Tmp_C := fmpy.fpsr3( x, p_1 ); +// If Tmp_C >= sigma_C then +// C_hi := Tmp_C; +// C_lo := x*p_1 - C_hi ...fma, exact +// Else +// C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C +// ...subtraction is exact, regardless +// ...of rounding direction +// C_lo := x*p_1 - C_hi ...fma, exact +// End If +// +// Tmp_B := fmpy.fpsr3( x, p_2 ); +// If Tmp_B >= sigma_B then +// B_hi := Tmp_B; +// B_lo := x*p_2 - B_hi ...fma, exact +// Else +// B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B +// ...subtraction is exact, regardless +// ...of rounding direction +// B_lo := x*p_2 - B_hi ...fma, exact +// End If +// +// Tmp_A := fmpy.fpsr3( x, p_3 ); +// If Tmp_A >= sigma_A then +// A_hi := Tmp_A; +// A_lo := x*p_3 - A_hi ...fma, exact +// Else +// A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A +// ...subtraction is exact, regardless +// ...of rounding direction +// A_lo := x*p_3 - A_hi ...fma, exact +// End If +// +// ...Note that C_hi is of integer value. We need only the +// ...last few bits. Thus we can ensure C_hi is never a big +// ...integer, freeing us from overflow worry. +// +// Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70); +// ...Tmp_C is the upper portion of C_hi +// C_hi := C_hi - Tmp_C +// ...0 <= C_hi < 2^7 +// +// Step 2. Get N and f +// ------------------- +// +// At this point, we have all the components to obtain +// S_0, S_1, S_2, S_3 and thus N and f. We start by adding +// C_lo and B_hi. This sum together with C_hi gives a good +// estimation of N and f. +// +// A := fadd.fpsr3( B_hi, C_lo ) +// B := max( B_hi, C_lo ) +// b := min( B_hi, C_lo ) +// +// a := (B - A) + b ...exact. Note that a is either 0 +// ...or 2^(-64). +// +// N := round_to_nearest_integer_value( A ); +// f := A - N; ...exact because lsb(A) >= 2^(-64) +// ...and |f| <= 1/2. 
+// +// f := f + a ...exact because a is 0 or 2^(-64); +// ...the msb of the sum is <= 1/2 +// ...lsb >= 2^(-64). +// +// N := convert to integer format( C_hi + N ); +// M := P_0 * x_lo; +// N := N + M; +// +// If sgn_x == 1 (that is original x was negative) +// N := 2^10 - N +// ...this maintains N to be non-negative, but still +// ...equivalent to the (negated N) mod 4. +// End If +// +// If |f| >= 2^(-33) +// +// ...Case 1 +// CASE := 1 +// g := A_hi + B_lo; +// s_hi := f + g; +// s_lo := (f - s_hi) + g; +// +// Else +// +// ...Case 2 +// CASE := 2 +// A := fadd.fpsr3( A_hi, B_lo ) +// B := max( A_hi, B_lo ) +// b := min( A_hi, B_lo ) +// +// a := (B - A) + b ...exact. Note that a is either 0 +// ...or 2^(-128). +// +// f_hi := A + f; +// f_lo := (f - f_hi) + A; +// ...this is exact. +// ...f-f_hi is exact because either |f| >= |A|, in which +// ...case f-f_hi is clearly exact; or otherwise, 0<|f|<|A| +// ...means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64). +// ...If f = 2^(-64), f-f_hi involves cancellation and is +// ...exact. If f = -2^(-64), then A + f is exact. Hence +// ...f-f_hi is -A exactly, giving f_lo = 0. +// +// f_lo := f_lo + a; +// +// If |f| >= 2^(-50) then +// s_hi := f_hi; +// s_lo := f_lo; +// Else +// f_lo := (f_lo + A_lo) + x*p_4 +// s_hi := f_hi + f_lo +// s_lo := (f_hi - s_hi) + f_lo +// End If +// +// End If +// +// Step 3. 
Get reduced argument +// ---------------------------- +// +// If sgn_x == 0 (that is original x is positive) +// +// D_hi := Pi_by_2_hi +// D_lo := Pi_by_2_lo +// ...load from table +// +// Else +// +// D_hi := neg_Pi_by_2_hi +// D_lo := neg_Pi_by_2_lo +// ...load from table +// End If +// +// r_hi := s_hi*D_hi +// r_lo := s_hi*D_hi - r_hi ...fma +// r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi +// +// Return CASE, N, r_hi, r_lo +// + +#include "libm_support.h" + +FR_X = f32 +FR_N = f33 +FR_p_1 = f34 +FR_TWOM33 = f35 +FR_TWOM50 = f36 +FR_g = f37 +FR_p_2 = f38 +FR_f = f39 +FR_s_lo = f40 +FR_p_3 = f41 +FR_f_abs = f42 +FR_D_lo = f43 +FR_p_4 = f44 +FR_D_hi = f45 +FR_Tmp2_C = f46 +FR_s_hi = f47 +FR_sigma_A = f48 +FR_A = f49 +FR_sigma_B = f50 +FR_B = f51 +FR_sigma_C = f52 +FR_b = f53 +FR_ScaleP2 = f54 +FR_ScaleP3 = f55 +FR_ScaleP4 = f56 +FR_Tmp_A = f57 +FR_Tmp_B = f58 +FR_Tmp_C = f59 +FR_A_hi = f60 +FR_f_hi = f61 +FR_r_hi = f62 +FR_A_lo = f63 +FR_B_hi = f64 +FR_a = f65 +FR_B_lo = f66 +FR_f_lo = f67 +FR_r_lo = f68 +FR_C_hi = f69 +FR_C_lo = f70 + +GR_N = r8 +GR_Address_of_Input = r32 +GR_Address_of_Outputs = r33 +GR_Exp_x = r36 +GR_Temp = r37 +GR_BIASL63 = r38 +GR_CASE = r39 +GR_x_lo = r40 +GR_sgn_x = r41 +GR_M = r42 +GR_BASE = r43 +GR_LENGTH1 = r44 +GR_LENGTH2 = r45 +GR_ASUB = r46 +GR_P_0 = r47 +GR_P_1 = r48 +GR_P_2 = r49 +GR_P_3 = r50 +GR_P_4 = r51 +GR_START = r52 +GR_SEGMENT = r53 +GR_A = r54 +GR_B = r55 +GR_C = r56 +GR_D = r57 +GR_E = r58 +GR_TEMP1 = r59 +GR_TEMP2 = r60 +GR_TEMP3 = r61 +GR_TEMP4 = r62 +GR_TEMP5 = r63 +GR_TEMP6 = r64 + +.align 64 + +#ifdef _LIBC +.rodata +#else +.data +#endif + +Constants_Bits_of_2_by_pi: +ASM_TYPE_DIRECTIVE(Constants_Bits_of_2_by_pi,@object) +data8 0x0000000000000000,0xA2F9836E4E441529 +data8 0xFC2757D1F534DDC0,0xDB6295993C439041 +data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0 +data8 0x06492EEA09D1921C,0xFE1DEB1CB129A73E +data8 0xE88235F52EBB4484,0xE99C7026B45F7E41 +data8 0x3991D639835339F4,0x9C845F8BBDF9283B +data8 
0x1FF897FFDE05980F,0xEF2F118B5A0A6D1F +data8 0x6D367ECF27CB09B7,0x4F463F669E5FEA2D +data8 0x7527BAC7EBE5F17B,0x3D0739F78A5292EA +data8 0x6BFB5FB11F8D5D08,0x56033046FC7B6BAB +data8 0xF0CFBC209AF4361D,0xA9E391615EE61B08 +data8 0x6599855F14A06840,0x8DFFD8804D732731 +data8 0x06061556CA73A8C9,0x60E27BC08C6B47C4 +data8 0x19C367CDDCE8092A,0x8359C4768B961CA6 +data8 0xDDAF44D15719053E,0xA5FF07053F7E33E8 +data8 0x32C2DE4F98327DBB,0xC33D26EF6B1E5EF8 +data8 0x9F3A1F35CAF27F1D,0x87F121907C7C246A +data8 0xFA6ED5772D30433B,0x15C614B59D19C3C2 +data8 0xC4AD414D2C5D000C,0x467D862D71E39AC6 +data8 0x9B0062337CD2B497,0xA7B4D55537F63ED7 +data8 0x1810A3FC764D2A9D,0x64ABD770F87C6357 +data8 0xB07AE715175649C0,0xD9D63B3884A7CB23 +data8 0x24778AD623545AB9,0x1F001B0AF1DFCE19 +data8 0xFF319F6A1E666157,0x9947FBACD87F7EB7 +data8 0x652289E83260BFE6,0xCDC4EF09366CD43F +data8 0x5DD7DE16DE3B5892,0x9BDE2822D2E88628 +data8 0x4D58E232CAC616E3,0x08CB7DE050C017A7 +data8 0x1DF35BE01834132E,0x6212830148835B8E +data8 0xF57FB0ADF2E91E43,0x4A48D36710D8DDAA +data8 0x425FAECE616AA428,0x0AB499D3F2A6067F +data8 0x775C83C2A3883C61,0x78738A5A8CAFBDD7 +data8 0x6F63A62DCBBFF4EF,0x818D67C12645CA55 +data8 0x36D9CAD2A8288D61,0xC277C9121426049B +data8 0x4612C459C444C5C8,0x91B24DF31700AD43 +data8 0xD4E5492910D5FDFC,0xBE00CC941EEECE70 +data8 0xF53E1380F1ECC3E7,0xB328F8C79405933E +data8 0x71C1B3092EF3450B,0x9C12887B20AB9FB5 +data8 0x2EC292472F327B6D,0x550C90A7721FE76B +data8 0x96CB314A1679E279,0x4189DFF49794E884 +data8 0xE6E29731996BED88,0x365F5F0EFDBBB49A +data8 0x486CA46742727132,0x5D8DB8159F09E5BC +data8 0x25318D3974F71C05,0x30010C0D68084B58 +data8 0xEE2C90AA4702E774,0x24D6BDA67DF77248 +data8 0x6EEF169FA6948EF6,0x91B45153D1F20ACF +data8 0x3398207E4BF56863,0xB25F3EDD035D407F +data8 0x8985295255C06437,0x10D86D324832754C +data8 0x5BD4714E6E5445C1,0x090B69F52AD56614 +data8 0x9D072750045DDB3B,0xB4C576EA17F9877D +data8 0x6B49BA271D296996,0xACCCC65414AD6AE2 +data8 0x9089D98850722CBE,0xA4049407777030F3 +data8 
0x27FC00A871EA49C2,0x663DE06483DD9797 +data8 0x3FA3FD94438C860D,0xDE41319D39928C70 +data8 0xDDE7B7173BDF082B,0x3715A0805C93805A +data8 0x921110D8E80FAF80,0x6C4BFFDB0F903876 +data8 0x185915A562BBCB61,0xB989C7BD401004F2 +data8 0xD2277549F6B6EBBB,0x22DBAA140A2F2689 +data8 0x768364333B091A94,0x0EAA3A51C2A31DAE +data8 0xEDAF12265C4DC26D,0x9C7A2D9756C0833F +data8 0x03F6F0098C402B99,0x316D07B43915200C +data8 0x5BC3D8C492F54BAD,0xC6A5CA4ECD37A736 +data8 0xA9E69492AB6842DD,0xDE6319EF8C76528B +data8 0x6837DBFCABA1AE31,0x15DFA1AE00DAFB0C +data8 0x664D64B705ED3065,0x29BF56573AFF47B9 +data8 0xF96AF3BE75DF9328,0x3080ABF68C6615CB +data8 0x040622FA1DE4D9A4,0xB33D8F1B5709CD36 +data8 0xE9424EA4BE13B523,0x331AAAF0A8654FA5 +data8 0xC1D20F3F0BCD785B,0x76F923048B7B7217 +data8 0x8953A6C6E26E6F00,0xEBEF584A9BB7DAC4 +data8 0xBA66AACFCF761D02,0xD12DF1B1C1998C77 +data8 0xADC3DA4886A05DF7,0xF480C62FF0AC9AEC +data8 0xDDBC5C3F6DDED01F,0xC790B6DB2A3A25A3 +data8 0x9AAF009353AD0457,0xB6B42D297E804BA7 +data8 0x07DA0EAA76A1597B,0x2A12162DB7DCFDE5 +data8 0xFAFEDB89FDBE896C,0x76E4FCA90670803E +data8 0x156E85FF87FD073E,0x2833676186182AEA +data8 0xBD4DAFE7B36E6D8F,0x3967955BBF3148D7 +data8 0x8416DF30432DC735,0x6125CE70C9B8CB30 +data8 0xFD6CBFA200A4E46C,0x05A0DD5A476F21D2 +data8 0x1262845CB9496170,0xE0566B0152993755 +data8 0x50B7D51EC4F1335F,0x6E13E4305DA92E85 +data8 0xC3B21D3632A1A4B7,0x08D4B1EA21F716E4 +data8 0x698F77FF2780030C,0x2D408DA0CD4F99A5 +data8 0x20D3A2B30A5D2F42,0xF9B4CBDA11D0BE7D +data8 0xC1DB9BBD17AB81A2,0xCA5C6A0817552E55 +data8 0x0027F0147F8607E1,0x640B148D4196DEBE +data8 0x872AFDDAB6256B34,0x897BFEF3059EBFB9 +data8 0x4F6A68A82A4A5AC4,0x4FBCF82D985AD795 +data8 0xC7F48D4D0DA63A20,0x5F57A4B13F149538 +data8 0x800120CC86DD71B6,0xDEC9F560BF11654D +data8 0x6B0701ACB08CD0C0,0xB24855510EFB1EC3 +data8 0x72953B06A33540C0,0x7BDC06CC45E0FA29 +data8 0x4EC8CAD641F3E8DE,0x647CD8649B31BED9 +data8 0xC397A4D45877C5E3,0x6913DAF03C3ABA46 +data8 0x18465F7555F5BDD2,0xC6926E5D2EACED44 +data8 
0x0E423E1C87C461E9,0xFD29F3D6E7CA7C22 +data8 0x35916FC5E0088DD7,0xFFE26A6EC6FDB0C1 +data8 0x0893745D7CB2AD6B,0x9D6ECD7B723E6A11 +data8 0xC6A9CFF7DF7329BA,0xC9B55100B70DB2E2 +data8 0x24BA74607DE58AD8,0x742C150D0C188194 +data8 0x667E162901767A9F,0xBEFDFDEF4556367E +data8 0xD913D9ECB9BA8BFC,0x97C427A831C36EF1 +data8 0x36C59456A8D8B5A8,0xB40ECCCF2D891234 +data8 0x576F89562CE3CE99,0xB920D6AA5E6B9C2A +data8 0x3ECC5F114A0BFDFB,0xF4E16D3B8E2C86E2 +data8 0x84D4E9A9B4FCD1EE,0xEFC9352E61392F44 +data8 0x2138C8D91B0AFC81,0x6A4AFBD81C2F84B4 +data8 0x538C994ECC2254DC,0x552AD6C6C096190B +data8 0xB8701A649569605A,0x26EE523F0F117F11 +data8 0xB5F4F5CBFC2DBC34,0xEEBC34CC5DE8605E +data8 0xDD9B8E67EF3392B8,0x17C99B5861BC57E1 +data8 0xC68351103ED84871,0xDDDD1C2DA118AF46 +data8 0x2C21D7F359987AD9,0xC0549EFA864FFC06 +data8 0x56AE79E536228922,0xAD38DC9367AAE855 +data8 0x3826829BE7CAA40D,0x51B133990ED7A948 +data8 0x0569F0B265A7887F,0x974C8836D1F9B392 +data8 0x214A827B21CF98DC,0x9F405547DC3A74E1 +data8 0x42EB67DF9DFE5FD4,0x5EA4677B7AACBAA2 +data8 0xF65523882B55BA41,0x086E59862A218347 +data8 0x39E6E389D49EE540,0xFB49E956FFCA0F1C +data8 0x8A59C52BFA94C5C1,0xD3CFC50FAE5ADB86 +data8 0xC5476243853B8621,0x94792C8761107B4C +data8 0x2A1A2C8012BF4390,0x2688893C78E4C4A8 +data8 0x7BDBE5C23AC4EAF4,0x268A67F7BF920D2B +data8 0xA365B1933D0B7CBD,0xDC51A463DD27DDE1 +data8 0x6919949A9529A828,0xCE68B4ED09209F44 +data8 0xCA984E638270237C,0x7E32B90F8EF5A7E7 +data8 0x561408F1212A9DB5,0x4D7E6F5119A5ABF9 +data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283 +data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED +data8 0x34007700D255F4FC,0x4D59018071E0E13F +data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB +ASM_SIZE_DIRECTIVE(Constants_Bits_of_2_by_pi) + +Constants_Bits_of_pi_by_2: +ASM_TYPE_DIRECTIVE(Constants_Bits_of_pi_by_2,@object) +data4 0x2168C234,0xC90FDAA2,0x00003FFF,0x00000000 +data4 0x80DC1CD1,0xC4C6628B,0x00003FBF,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_Bits_of_pi_by_2) + +.section .text +.proc __libm_pi_by_2_reduce# +.global 
__libm_pi_by_2_reduce# +.align 64 + +__libm_pi_by_2_reduce: + +// X is at the address in Address_of_Input +// Place the two-piece result at the address in Address_of_Outputs +// r followed by c +// N is returned + +{ .mmf +alloc r34 = ar.pfs,2,34,0,0 +(p0) ldfe FR_X = [GR_Address_of_Input] +(p0) fsetc.s3 0x00,0x7F ;; +} +{ .mlx + nop.m 999 +(p0) movl GR_BIASL63 = 0x1003E +} +;; + + +// L -1-2-3-4 +// 0 0 0 0 0. 1 0 1 0 +// M 0 1 2 .... 63, 64 65 ... 127, 128 +// --------------------------------------------- +// Segment 0. 1 , 2 , 3 +// START = M - 63 M = 128 becomes 65 +// LENGTH1 = START & 0x3F 65 becomes position 1 +// SEGMENT = shr(START,6) + 1 0 maps to 1, 64 maps to 2, +// LENGTH2 = 64 - LENGTH1 +// Address_BASE = shladd(SEGMENT,3) + BASE + + + +{ .mmi + nop.m 999 +(p0) addl GR_BASE = @ltoff(Constants_Bits_of_2_by_pi#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_BASE = [GR_BASE] + nop.m 999 + nop.i 999 +} +;; + + +{ .mlx + nop.m 999 +(p0) movl GR_TEMP5 = 0x000000000000FFFE +} +{ .mmi + nop.m 999 ;; +(p0) setf.exp FR_sigma_B = GR_TEMP5 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl GR_TEMP6 = 0x000000000000FFBE ;; +} +// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65). +{ .mfi +(p0) setf.exp FR_sigma_A = GR_TEMP6 + nop.f 999 + nop.i 999 ;; +} +// Special Code for testing DE arguments +// (p0) movl GR_BIASL63 = 0x0000000000013FFE +// (p0) movl GR_x_lo = 0xFFFFFFFFFFFFFFFF +// (p0) setf.exp FR_X = GR_BIASL63 +// (p0) setf.sig FR_ScaleP3 = GR_x_lo +// (p0) fmerge.se FR_X = FR_X,FR_ScaleP3 +// Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x. +// 2/pi is stored contiguously as +// 0x00000000 0x00000000.0xA2F.... +// M = EXP - BIAS ( M >= 63) +// Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. +// Thus -16321 <= L <= -1. 
+{ .mmf +(p0) getf.exp GR_Exp_x = FR_X +(p0) getf.sig GR_x_lo = FR_X +(p0) fabs FR_X = FR_X ;; +} +{ .mii +(p0) and GR_x_lo = 0x03,GR_x_lo +(p0) extr.u GR_M = GR_Exp_x,0,17 ;; +(p0) sub GR_START = GR_M,GR_BIASL63 +} +{ .mmi + nop.m 999 ;; +(p0) and GR_LENGTH1 = 0x3F,GR_START +(p0) shr.u GR_SEGMENT = GR_START,6 +} +{ .mmi + nop.m 999 ;; +(p0) add GR_SEGMENT = 0x1,GR_SEGMENT +(p0) sub GR_LENGTH2 = 0x40,GR_LENGTH1 +} +// P_0 is the two bits corresponding to bit positions L+2 and L+1 +// P_1 is the 64-bit starting at bit position L +// P_2 is the 64-bit starting at bit position L-64 +// P_3 is the 64-bit starting at bit position L-128 +// P_4 is the 64-bit starting at bit position L-192 +// P_1 is made up of Alo and Bhi +// P_1 = deposit Alo, position 0, length2 into P_1,position length1 +// deposit Bhi, position length2, length1 into P_1, position 0 +// P_2 is made up of Blo and Chi +// P_2 = deposit Blo, position 0, length2 into P_2, position length1 +// deposit Chi, position length2, length1 into P_2, position 0 +// P_3 is made up of Clo and Dhi +// P_3 = deposit Clo, position 0, length2 into P_3, position length1 +// deposit Dhi, position length2, length1 into P_3, position 0 +// P_4 is made up of Dlo and Ehi +// P_4 = deposit Dlo, position 0, length2 into P_4, position length1 +// deposit Ehi, position length2, length1 into P_4, position 0 +{ .mmi +(p0) cmp.le.unc p6,p7 = 0x2,GR_LENGTH1 ;; +(p0) shladd GR_BASE = GR_SEGMENT,3,GR_BASE +(p7) cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 ;; +} +{ .mmi + nop.m 999 +// ld_64 A at Base and increment Base by 8 +// ld_64 B at Base and increment Base by 8 +// ld_64 C at Base and increment Base by 8 +// ld_64 D at Base and increment Base by 8 +// ld_64 E at Base and decrement Base by 40 +// A/B/C/D +// --------------------- +// A, B, C, D, and E look like | length1 | length2 | +// --------------------- +// hi lo +(p0) ld8 GR_A = [GR_BASE],8 +(p0) extr.u GR_sgn_x = GR_Exp_x,17,1 ;; +} +{ .mmf + nop.m 999 +(p0) ld8 GR_B = [GR_BASE],8 +(p0) 
fmerge.se FR_X = FR_sigma_B,FR_X ;; +} +{ .mii +(p0) ld8 GR_C = [GR_BASE],8 +(p8) extr.u GR_Temp = GR_A,63,1 ;; +(p0) shl GR_TEMP1 = GR_A,GR_LENGTH1 +} +{ .mii +(p0) ld8 GR_D = [GR_BASE],8 +// If length1 >= 2, +// P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0. +(p6) shr.u GR_P_0 = GR_A,GR_LENGTH2 ;; +(p0) shl GR_TEMP2 = GR_B,GR_LENGTH1 +} +{ .mii +(p0) ld8 GR_E = [GR_BASE],-40 +(p0) shr.u GR_P_1 = GR_B,GR_LENGTH2 ;; +(p0) shr.u GR_P_2 = GR_C,GR_LENGTH2 +} +// Else +// Load 16 bit of ASUB from (Base_Address_of_A - 2) +// P_0 = ASUB & 0x3 +// If length1 == 0, +// P_0 complete +// Else +// Deposit element 63 from Ahi and place in element 0 of P_0. +// Endif +// Endif +{ .mii +(p7) ld2 GR_ASUB = [GR_BASE],8 +(p0) shl GR_TEMP3 = GR_C,GR_LENGTH1 ;; +(p0) shl GR_TEMP4 = GR_D,GR_LENGTH1 +} +{ .mii + nop.m 999 +(p0) shr.u GR_P_3 = GR_D,GR_LENGTH2 ;; +(p0) shr.u GR_P_4 = GR_E,GR_LENGTH2 +} +{ .mii +(p7) and GR_P_0 = 0x03,GR_ASUB +(p6) and GR_P_0 = 0x03,GR_P_0 ;; +(p0) or GR_P_1 = GR_P_1,GR_TEMP1 +} +{ .mmi +(p8) and GR_P_0 = 0x1,GR_P_0 ;; +(p0) or GR_P_2 = GR_P_2,GR_TEMP2 +(p8) shl GR_P_0 = GR_P_0,0x1 ;; +} +{ .mii + nop.m 999 +(p0) or GR_P_3 = GR_P_3,GR_TEMP3 +(p8) or GR_P_0 = GR_P_0,GR_Temp +} +{ .mmi +(p0) setf.sig FR_p_1 = GR_P_1 ;; +(p0) setf.sig FR_p_2 = GR_P_2 +(p0) or GR_P_4 = GR_P_4,GR_TEMP4 ;; +} +{ .mmi + nop.m 999 ;; +(p0) setf.sig FR_p_3 = GR_P_3 +(p0) pmpy2.r GR_M = GR_P_0,GR_x_lo +} +{ .mlx +(p0) setf.sig FR_p_4 = GR_P_4 +// P_1, P_2, P_3, P_4 are integers. They should be +// 2^(L-63) * P_1; +// 2^(L-63-64) * P_2; +// 2^(L-63-128) * P_3; +// 2^(L-63-192) * P_4; +// Since each of them need to be multiplied to x, we would scale +// both x and the P_j's by some convenient factors: scale each +// of P_j's up by 2^(63-L), and scale x down by 2^(L-63). 
+// p_1 := fcvt.xf ( P_1 ) +// p_2 := fcvt.xf ( P_2 ) * 2^(-64) +// p_3 := fcvt.xf ( P_3 ) * 2^(-128) +// p_4 := fcvt.xf ( P_4 ) * 2^(-192) +// x= Set x's exp to -1 because 2^m*1.x...x *2^(L-63)=2^(-1)*1.x...xxx +// --------- --------- --------- +// | P_1 | | P_2 | | P_3 | +// --------- --------- --------- +// --------- +// X | X | +// --------- +// ---------------------------------------------------- +// --------- --------- +// | A_hi | | A_lo | +// --------- --------- +// --------- --------- +// | B_hi | | B_lo | +// --------- --------- +// --------- --------- +// | C_hi | | C_lo | +// --------- --------- +// ==================================================== +// ----------- --------- --------- --------- +// | S_0 | | S_1 | | S_2 | | S_3 | +// ----------- --------- --------- --------- +// | |___ binary point +// |___ possibly one more bit +// +// Let FPSR3 be set to round towards zero with widest precision +// and exponent range. Unless an explicit FPSR is given, +// round-to-nearest with widest precision and exponent range is +// used. 
+(p0) movl GR_TEMP1 = 0x000000000000FFBF +} +{ .mmi + nop.m 999 ;; +(p0) setf.exp FR_ScaleP2 = GR_TEMP1 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl GR_TEMP4 = 0x000000000001003E +} +{ .mmi + nop.m 999 ;; +(p0) setf.exp FR_sigma_C = GR_TEMP4 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl GR_TEMP2 = 0x000000000000FF7F ;; +} +{ .mmf + nop.m 999 +(p0) setf.exp FR_ScaleP3 = GR_TEMP2 +(p0) fcvt.xuf.s1 FR_p_1 = FR_p_1 ;; +} +{ .mfi + nop.m 999 +(p0) fcvt.xuf.s1 FR_p_2 = FR_p_2 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl GR_Temp = 0x000000000000FFDE ;; +} +{ .mmf + nop.m 999 +(p0) setf.exp FR_TWOM33 = GR_Temp +(p0) fcvt.xuf.s1 FR_p_3 = FR_p_3 ;; +} +{ .mfi + nop.m 999 +(p0) fcvt.xuf.s1 FR_p_4 = FR_p_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// Tmp_C := fmpy.fpsr3( x, p_1 ); +// Tmp_B := fmpy.fpsr3( x, p_2 ); +// Tmp_A := fmpy.fpsr3( x, p_3 ); +// If Tmp_C >= sigma_C then +// C_hi := Tmp_C; +// C_lo := x*p_1 - C_hi ...fma, exact +// Else +// C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C +// C_lo := x*p_1 - C_hi ...fma, exact +// End If +// If Tmp_B >= sigma_B then +// B_hi := Tmp_B; +// B_lo := x*p_2 - B_hi ...fma, exact +// Else +// B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B +// B_lo := x*p_2 - B_hi ...fma, exact +// End If +// If Tmp_A >= sigma_A then +// A_hi := Tmp_A; +// A_lo := x*p_3 - A_hi ...fma, exact +// Else +// A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A +// Exact, regardless ...of rounding direction +// A_lo := x*p_3 - A_hi ...fma, exact +// Endif +(p0) fmpy.s3 FR_Tmp_C = FR_X,FR_p_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl GR_Temp = 0x0000000000000400 +} +{ .mlx + nop.m 999 +(p0) movl GR_TEMP3 = 0x000000000000FF3F ;; +} +{ .mmf + nop.m 999 +(p0) setf.exp FR_ScaleP4 = GR_TEMP3 +(p0) fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 ;; +} +{ .mlx + nop.m 999 +(p0) movl GR_TEMP4 = 0x0000000000010045 ;; +} +{ .mmf + nop.m 999 +(p0) setf.exp FR_Tmp2_C = GR_TEMP4 +(p0) fmpy.s3 FR_Tmp_B = 
FR_X,FR_p_2 ;; +} +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p12, p9 = FR_Tmp_C,FR_sigma_C + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s3 FR_Tmp_A = FR_X,FR_p_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) mov FR_C_hi = FR_Tmp_C + nop.i 999 ;; +} +{ .mfi +(p0) addl GR_BASE = @ltoff(Constants_Bits_of_pi_by_2#), gp +(p9) fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C + nop.i 999 +} +;; + + + +// End If +// Step 3. Get reduced argument +// If sgn_x == 0 (that is original x is positive) +// D_hi := Pi_by_2_hi +// D_lo := Pi_by_2_lo +// Load from table +// Else +// D_hi := neg_Pi_by_2_hi +// D_lo := neg_Pi_by_2_lo +// Load from table +// End If + + +{ .mmi + ld8 GR_BASE = [GR_BASE] + nop.m 999 + nop.i 999 +} +;; + + +{ .mfi +(p0) ldfe FR_D_hi = [GR_BASE],16 +(p0) fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4 + nop.i 999 ;; +} +{ .mfi +(p0) ldfe FR_D_lo = [GR_BASE],0 +(p0) fcmp.ge.unc.s1 p13, p10 = FR_Tmp_B,FR_sigma_B + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p13) mov FR_B_hi = FR_Tmp_B + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p14, p11 = FR_Tmp_A,FR_sigma_A + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p14) mov FR_A_hi = FR_Tmp_A + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi +(p0) cmp.eq.unc p12,p9 = 0x1,GR_sgn_x +} +{ .mfi + nop.m 999 +(p13) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B + nop.i 999 +} +{ .mfi + nop.m 999 +// Note that C_hi is of integer value. We need only the +// last few bits. Thus we can ensure C_hi is never a big +// integer, freeing us from overflow worry. 
+// Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70); +// Tmp_C is the upper portion of C_hi +(p0) fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p14) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// ******************* +// Step 2. Get N and f +// ******************* +// We have all the components to obtain +// S_0, S_1, S_2, S_3 and thus N and f. We start by adding +// C_lo and B_hi. This sum together with C_hi estimates +// N and f well. +// A := fadd.fpsr3( B_hi, C_lo ) +// B := max( B_hi, C_lo ) +// b := min( B_hi, C_lo ) +(p0) fadd.s3 FR_A = FR_B_hi,FR_C_lo + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmax.s1 FR_B = FR_B_hi,FR_C_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmin.s1 FR_b = FR_B_hi,FR_C_lo + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// N := round_to_nearest_integer_value( A ); +(p0) fcvt.fx.s1 FR_N = FR_A + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7 +(p0) fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// a := (B - A) + b: Exact - note that a is either 0 or 2^(-64). +(p0) fsub.s1 FR_a = FR_B,FR_A + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2. 
+(p0) fnorm.s1 FR_N = FR_N + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_a = FR_a,FR_b + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fsub.s1 FR_f = FR_A,FR_N + nop.i 999 +} +{ .mfi + nop.m 999 +// N := convert to integer format( C_hi + N ); +// M := P_0 * x_lo; +// N := N + M; +(p0) fadd.s1 FR_N = FR_N,FR_C_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// f = f + a Exact because a is 0 or 2^(-64); +// the msb of the sum is <= 1/2 and lsb >= 2^(-64). +(p0) fadd.s1 FR_f = FR_f,FR_a + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Create 2**(-33) +// +(p0) fcvt.fx.s1 FR_N = FR_N + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fabs FR_f_abs = FR_f + nop.i 999 ;; +} +{ .mfi +(p0) getf.sig GR_N = FR_N + nop.f 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +(p0) add GR_N = GR_N,GR_M ;; +} +// If sgn_x == 1 (that is original x was negative) +// N := 2^10 - N +// this maintains N to be non-negative, but still +// equivalent to the (negated N) mod 4. +// End If +{ .mii +(p12) sub GR_N = GR_Temp,GR_N +(p0) cmp.eq.unc p12,p9 = 0x0,GR_sgn_x ;; + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fcmp.ge.unc.s1 p13, p10 = FR_f_abs,FR_TWOM33 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fsub.s1 FR_D_hi = f0, FR_D_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fadd.s3 FR_A = FR_A_hi,FR_B_lo + nop.i 999 +} +{ .mfi + nop.m 999 +(p13) fadd.s1 FR_g = FR_A_hi,FR_B_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fmax.s1 FR_B = FR_A_hi,FR_B_lo + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fsub.s1 FR_D_lo = f0, FR_D_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fmin.s1 FR_b = FR_A_hi,FR_B_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x40 + nop.i 999 +} +{ .mlx + nop.m 999 +(p10) movl GR_Temp = 0x000000000000FFCD ;; +} +{ .mmf + nop.m 999 +(p10) setf.exp FR_TWOM50 = GR_Temp +(p10) fadd.s1 FR_f_hi = FR_A,FR_f ;; +} +{ .mfi + nop.m 999 +// a := (B - A) + b Exact. +// Note that a is either 0 or 2^(-128). 
+// f_hi := A + f; +// f_lo := (f - f_hi) + A +// f_lo=f-f_hi is exact because either |f| >= |A|, in which +// case f-f_hi is clearly exact; or otherwise, 0<|f|<|A| +// means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64). +// If f = 2^(-64), f-f_hi involves cancellation and is +// exact. If f = -2^(-64), then A + f is exact. Hence +// f-f_hi is -A exactly, giving f_lo = 0. +// f_lo := f_lo + a; +(p10) fsub.s1 FR_a = FR_B,FR_A + nop.i 999 +} +{ .mfi + nop.m 999 +(p13) fadd.s1 FR_s_hi = FR_f,FR_g + nop.i 999 ;; +} +{ .mlx + nop.m 999 +// If |f| >= 2^(-33) +// Case 1 +// CASE := 1 +// g := A_hi + B_lo; +// s_hi := f + g; +// s_lo := (f - s_hi) + g; +(p13) movl GR_CASE = 0x1 ;; +} +{ .mlx + nop.m 999 +// Else +// Case 2 +// CASE := 2 +// A := fadd.fpsr3( A_hi, B_lo ) +// B := max( A_hi, B_lo ) +// b := min( A_hi, B_lo ) +(p10) movl GR_CASE = 0x2 +} +{ .mfi + nop.m 999 +(p10) fsub.s1 FR_f_lo = FR_f,FR_f_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fadd.s1 FR_a = FR_a,FR_b + nop.i 999 +} +{ .mfi + nop.m 999 +(p13) fsub.s1 FR_s_lo = FR_f,FR_s_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p13) fadd.s1 FR_s_lo = FR_s_lo,FR_g + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Create 2**(-50) +(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_A + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// If |f| >= 2^(-50) then +// s_hi := f_hi; +// s_lo := f_lo; +// Else +// f_lo := (f_lo + A_lo) + x*p_4 +// s_hi := f_hi + f_lo +// s_lo := (f_hi - s_hi) + f_lo +// End If +(p14) mov FR_s_hi = FR_f_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p14) mov FR_s_lo = FR_f_lo + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// r_hi := 
s_hi*D_hi +// r_lo := s_hi*D_hi - r_hi with fma +// r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi +(p0) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo + nop.i 999 ;; +} +{ .mmi + nop.m 999 ;; +// Return N, r_hi, r_lo +// We do not return CASE +(p0) stfe [GR_Address_of_Outputs] = FR_r_hi,16 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo + nop.i 999 ;; +} +{ .mmi + nop.m 999 ;; +(p0) stfe [GR_Address_of_Outputs] = FR_r_lo,-16 + nop.i 999 +} +{ .mib + nop.m 999 + nop.i 999 +(p0) br.ret.sptk b0 ;; +} + +.endp __libm_pi_by_2_reduce +ASM_SIZE_DIRECTIVE(__libm_pi_by_2_reduce) diff --git a/sysdeps/ia64/fpu/libm_support.h b/sysdeps/ia64/fpu/libm_support.h new file mode 100644 index 0000000000..995b104388 --- /dev/null +++ b/sysdeps/ia64/fpu/libm_support.h @@ -0,0 +1,339 @@ +// +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// + +// History: 02/02/2000 Initial version +// 2/28/2000 added tags for logb and nextafter +// 3/22/2000 Changes to support _LIB_VERSION variable +// and filled some enum gaps. Added support for C99. +// 5/31/2000 added prototypes for __libm_frexp_4l/8l +// 8/10/2000 Changed declaration of _LIB_VERSION to work for library +// builds and other application builds (precompiler directives). +// 8/11/2000 Added pointers-to-matherr-functions declarations to allow +// for user-defined matherr functions in the dll build. +// 12/07/2000 Added scalbn error_types values. 
+// + +#ifndef ASSEMBLER +#include <math.h> + +float __libm_frexp_4f( float x, int* exp); +float __libm_frexp_8f( float x, int* exp); +double __libm_frexp_4( double x, int* exp); +double __libm_frexp_8( double x, int* exp); +long double __libm_frexp_4l( long double x, int* exp); +long double __libm_frexp_8l( long double x, int* exp); +void __libm_sincos_pi4(double,double*,double*,int); +void __libm_y0y1(double , double *, double *); +void __libm_j0j1(double , double *, double *); +double __libm_lgamma_kernel(double,int*,int,int); +double __libm_j0(double); +double __libm_j1(double); +double __libm_jn(int,double); +double __libm_y0(double); +double __libm_y1(double); +double __libm_yn(int,double); + +extern double rint(double); +extern double sqrt(double); +extern double fabs(double); +extern double log(double); +extern double log1p(double); +extern double sqrt(double); +extern double sin(double); +extern double exp(double); +extern double modf(double, double *); +extern double asinh(double); +extern double acosh(double); +extern double atanh(double); +extern double tanh(double); +extern double erf(double); +extern double erfc(double); +extern double j0(double); +extern double j1(double); +extern double jn(int, double); +extern double y0(double); +extern double y1(double); +extern double yn(int, double); + +extern float fabsf(float); +extern float asinhf(float); +extern float acoshf(float); +extern float atanhf(float); +extern float tanhf(float); +extern float erff(float); +extern float erfcf(float); +extern float j0f(float); +extern float j1f(float); +extern float jnf(int, float); +extern float y0f(float); +extern float y1f(float); +extern float ynf(int, float); + +extern long double log1pl(long double); +extern long double logl(long double); +extern long double sqrtl(long double); +extern long double expl(long double); + +extern long lround(double); +extern long lroundf(float); +extern long lroundl(long double); + +#if !(defined(SIZE_INT_32) || 
defined(SIZE_INT_64)) + #error integer size not established; define SIZE_INT_32 or SIZE_INT_64 +#endif + +struct fp64 { /*/ sign:1 exponent:11 significand:52 (implied leading 1)*/ + unsigned lo_significand:32; + unsigned hi_significand:20; + unsigned exponent:11; + unsigned sign:1; +}; + +#define HI_SIGNIFICAND_LESS(X, HI) ((X)->hi_significand < 0x ## HI) +#define f64abs(x) ((x) < 0.0 ? -(x) : (x)) + +typedef enum +{ + logl_zero=0, logl_negative, /* 0, 1 */ + log_zero, log_negative, /* 2, 3 */ + logf_zero, logf_negative, /* 4, 5 */ + log10l_zero, log10l_negative, /* 6, 7 */ + log10_zero, log10_negative, /* 8, 9 */ + log10f_zero, log10f_negative, /* 10, 11 */ + expl_overflow, expl_underflow, /* 12, 13 */ + exp_overflow, exp_underflow, /* 14, 15 */ + expf_overflow, expf_underflow, /* 16, 17 */ + powl_overflow, powl_underflow, /* 18, 19 */ + powl_zero_to_zero, /* 20 */ + powl_zero_to_negative, /* 21 */ + powl_neg_to_non_integer, /* 22 */ + powl_nan_to_zero, /* 23 */ + pow_overflow, pow_underflow, /* 24, 25 */ + pow_zero_to_zero, /* 26 */ + pow_zero_to_negative, /* 27 */ + pow_neg_to_non_integer, /* 28 */ + pow_nan_to_zero, /* 29 */ + powf_overflow, powf_underflow, /* 30, 31 */ + powf_zero_to_zero, /* 32 */ + powf_zero_to_negative, /* 33 */ + powf_neg_to_non_integer, /* 34 */ + powf_nan_to_zero, /* 35 */ + atan2l_zero, /* 36 */ + atan2_zero, /* 37 */ + atan2f_zero, /* 38 */ + expm1l_overflow, /* 39 */ + expm1l_underflow, /* 40 */ + expm1_overflow, /* 41 */ + expm1_underflow, /* 42 */ + expm1f_overflow, /* 43 */ + expm1f_underflow, /* 44 */ + hypotl_overflow, /* 45 */ + hypot_overflow, /* 46 */ + hypotf_overflow, /* 47 */ + sqrtl_negative, /* 48 */ + sqrt_negative, /* 49 */ + sqrtf_negative, /* 50 */ + scalbl_overflow, scalbl_underflow, /* 51, 52 */ + scalb_overflow, scalb_underflow, /* 53, 54 */ + scalbf_overflow, scalbf_underflow, /* 55, 56 */ + acosl_gt_one, acos_gt_one, acosf_gt_one, /* 57, 58, 59 */ + asinl_gt_one, asin_gt_one, asinf_gt_one, /* 60, 61, 62 */ + 
coshl_overflow, cosh_overflow, coshf_overflow, /* 63, 64, 65 */ + y0l_zero, y0l_negative,y0l_gt_loss, /* 66, 67, 68 */ + y0_zero, y0_negative,y0_gt_loss, /* 69, 70, 71 */ + y0f_zero, y0f_negative,y0f_gt_loss, /* 72, 73, 74 */ + y1l_zero, y1l_negative,y1l_gt_loss, /* 75, 76, 77 */ + y1_zero, y1_negative,y1_gt_loss, /* 78, 79, 80 */ + y1f_zero, y1f_negative,y1f_gt_loss, /* 81, 82, 83 */ + ynl_zero, ynl_negative,ynl_gt_loss, /* 84, 85, 86 */ + yn_zero, yn_negative,yn_gt_loss, /* 87, 88, 89 */ + ynf_zero, ynf_negative,ynf_gt_loss, /* 90, 91, 92 */ + j0l_gt_loss, /* 93 */ + j0_gt_loss, /* 94 */ + j0f_gt_loss, /* 95 */ + j1l_gt_loss, /* 96 */ + j1_gt_loss, /* 97 */ + j1f_gt_loss, /* 98 */ + jnl_gt_loss, /* 99 */ + jn_gt_loss, /* 100 */ + jnf_gt_loss, /* 101 */ + lgammal_overflow, lgammal_negative,lgammal_reserve, /* 102, 103, 104 */ + lgamma_overflow, lgamma_negative,lgamma_reserve, /* 105, 106, 107 */ + lgammaf_overflow, lgammaf_negative, lgammaf_reserve,/* 108, 109, 110 */ + gammal_overflow,gammal_negative, gammal_reserve, /* 111, 112, 113 */ + gamma_overflow, gamma_negative, gamma_reserve, /* 114, 115, 116 */ + gammaf_overflow,gammaf_negative,gammaf_reserve, /* 117, 118, 119 */ + fmodl_by_zero, /* 120 */ + fmod_by_zero, /* 121 */ + fmodf_by_zero, /* 122 */ + remainderl_by_zero, /* 123 */ + remainder_by_zero, /* 124 */ + remainderf_by_zero, /* 125 */ + sinhl_overflow, sinh_overflow, sinhf_overflow, /* 126, 127, 128 */ + atanhl_gt_one, atanhl_eq_one, /* 129, 130 */ + atanh_gt_one, atanh_eq_one, /* 131, 132 */ + atanhf_gt_one, atanhf_eq_one, /* 133, 134 */ + acoshl_lt_one, /* 135 */ + acosh_lt_one, /* 136 */ + acoshf_lt_one, /* 137 */ + log1pl_zero, log1pl_negative, /* 138, 139 */ + log1p_zero, log1p_negative, /* 140, 141 */ + log1pf_zero, log1pf_negative, /* 142, 143 */ + ldexpl_overflow, ldexpl_underflow, /* 144, 145 */ + ldexp_overflow, ldexp_underflow, /* 146, 147 */ + ldexpf_overflow, ldexpf_underflow, /* 148, 149 */ + logbl_zero, logb_zero, logbf_zero, /* 150, 151, 
152 */ + nextafterl_overflow, nextafter_overflow, + nextafterf_overflow, /* 153, 154, 155 */ + ilogbl_zero, ilogb_zero, ilogbf_zero, /* 156, 157, 158 */ + exp2l_overflow, exp2l_underflow, /* 159, 160 */ + exp2_overflow, exp2_underflow, /* 161, 162 */ + exp2f_overflow, exp2f_underflow, /* 163, 164 */ + exp10l_overflow, exp10_overflow, + exp10f_overflow, /* 165, 166, 167 */ + log2l_zero, log2l_negative, /* 168, 169 */ + log2_zero, log2_negative, /* 170, 171 */ + log2f_zero, log2f_negative, /* 172, 173 */ + scalbnl_overflow, scalbnl_underflow, /* 174, 175 */ + scalbn_overflow, scalbn_underflow, /* 176, 177 */ + scalbnf_overflow, scalbnf_underflow /* 178, 179 */ +} error_types; + +void __libm_error_support(void*,void*,void*,error_types); + +#define BIAS_64 1023 +#define EXPINF_64 2047 + +#define DOUBLE_HEX(HI, LO) 0x ## LO, 0x ## HI + +#if 0 +static const unsigned INF[] = { + DOUBLE_HEX(7ff00000, 00000000), + DOUBLE_HEX(fff00000, 00000000) +}; + +static const double _zeroo = 0.0; +static const double _bigg = 1.0e300; +static const double _ponee = 1.0; +static const double _nonee = -1.0; + +#define INVALID (_zeroo * *((double*)&INF[0])) +#define PINF *((double*)&INF[0]) +#define NINF -PINF +#define PINF_DZ (_ponee/_zeroo) +#define X_TLOSS 1.41484755040568800000e+16 +#endif + +struct exceptionf +{ + int type; + char *name; + float arg1, arg2, retval; +}; + +# ifdef __cplusplus +struct __exception +{ + int type; + char *name; + double arg1, arg2, retval; +}; +# else + +# ifndef _LIBC +struct exception +{ + int type; + char *name; + double arg1, arg2, retval; +}; +# endif +# endif + + + +struct exceptionl +{ + int type; + char *name; + long double arg1, arg2, retval; +}; + +#ifdef _MS_ +#define MATHERR_F _matherrf +#define MATHERR_D _matherr +#else +#define MATHERR_F matherrf +#define MATHERR_D matherr +#endif + +# ifdef __cplusplus +#define EXC_DECL_D __exception +#else +// exception is a reserved name in C++ +#define EXC_DECL_D exception +#endif + +extern int 
MATHERR_F(struct exceptionf*); +extern int MATHERR_D(struct EXC_DECL_D*); +extern int matherrl(struct exceptionl*); + + +/* Set these appropriately to make thread Safe */ +#define ERRNO_RANGE errno = ERANGE +#define ERRNO_DOMAIN errno = EDOM + + +// Add code to support _LIB_VERSION +#ifndef _LIBC +typedef enum +{ + _IEEE_ = -1, // IEEE-like behavior + _SVID_, // SysV, Rel. 4 behavior + _XOPEN_, // Unix98 + _POSIX_, // Posix + _ISOC_ // ISO C9X +} _LIB_VERSION_TYPE; + +extern _LIB_VERSION_TYPE _LIB_VERSION; +#endif + +// This is a run-time variable and may effect +// floating point behavior of the libm functions + +#endif /* ASSEMBLER */ + +/* Support for compatible assembler handling. */ +#if !defined L && defined _LIBC +#define L(name) .L##name +#endif +#ifdef __ELF__ +#define ASM_SIZE_DIRECTIVE(name) .size name,.-name +#define ASM_TYPE_DIRECTIVE(name,T) .type name,T +#else +#define ASM_SIZE_DIRECTIVE(name) +#define ASM_TYPE_DIRECTIVE(name,T) +#endif diff --git a/sysdeps/ia64/fpu/libm_tan.S b/sysdeps/ia64/fpu/libm_tan.S new file mode 100644 index 0000000000..c587d6433c --- /dev/null +++ b/sysdeps/ia64/fpu/libm_tan.S @@ -0,0 +1,3319 @@ +.file "libm_tan.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// ********************************************************************* +// +// History: +// 02/02/00 Initial Version +// 4/04/00 Unwind support added +// 12/28/00 Fixed false invalid flags +// +// ********************************************************************* +// +// Function: tan(x) = tangent(x), for double precision x values +// +// ********************************************************************* +// +// Accuracy: Very accurate for double-precision values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9-f15 +// f32-f112 +// +// General Purpose Registers: +// r32-r48 +// r49-r50 (Used to pass arguments to pi_by_2 reduce routine) +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions do not occur +// Underflow exceptions raised when appropriate for tan +// (No specialized error handling for this routine) +// Inexact raised when appropriate by algorithm +// +// tan(SNaN) = QNaN +// tan(QNaN) = QNaN +// tan(inf) = QNaN +// tan(+/-0) = +/-0 +// +// 
********************************************************************* +// +// Mathematical Description +// +// We consider the computation of FPTAN of Arg. Now, given +// +// Arg = N pi/2 + alpha, |alpha| <= pi/4, +// +// basic mathematical relationship shows that +// +// tan( Arg ) = tan( alpha ) if N is even; +// = -cot( alpha ) otherwise. +// +// The value of alpha is obtained by argument reduction and +// represented by two working precision numbers r and c where +// +// alpha = r + c accurately. +// +// The reduction method is described in a previous write up. +// The argument reduction scheme identifies 4 cases. For Cases 2 +// and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be +// computed very easily by 2 or 3 terms of the Taylor series +// expansion as follows: +// +// Case 2: +// ------- +// +// tan(r + c) = r + c + r^3/3 ...accurately +// -cot(r + c) = -1/(r+c) + r/3 ...accurately +// +// Case 4: +// ------- +// +// tan(r + c) = r + c + r^3/3 + 2r^5/15 ...accurately +// -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately +// +// +// The only cases left are Cases 1 and 3 of the argument reduction +// procedure. These two cases will be merged since after the +// argument is reduced in either case, we have the reduced argument +// represented as r + c and that the magnitude |r + c| is not small +// enough to allow the usage of a very short approximation. +// +// The greatest challenge of this task is that the second terms of +// the Taylor series for tan(r) and -cot(r) +// +// r + r^3/3 + 2 r^5/15 + ... +// +// and +// +// -1/r + r/3 + r^3/45 + ... +// +// are not very small when |r| is close to pi/4 and the rounding +// errors will be a concern if simple polynomial accumulation is +// used. When |r| < 2^(-2), however, the second terms will be small +// enough (5 bits or so of right shift) that a normal Horner +// recurrence suffices. Hence there are two cases that we consider +// in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
+// +// Case small_r: |r| < 2^(-2) +// -------------------------- +// +// Since Arg = N pi/2 + r + c accurately, we have +// +// tan(Arg) = tan(r+c) for N even, +// = -cot(r+c) otherwise. +// +// Here for this case, both tan(r) and -cot(r) can be approximated +// by simple polynomials: +// +// tan(r) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 +// -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 +// +// accurately. Since |r| is relatively small, tan(r+c) and +// -cot(r+c) can be accurately approximated by replacing r with +// r+c only in the first two terms of the corresponding polynomials. +// +// Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to +// almost 64 sig. bits, thus +// +// P1_1 (r+c)^3 = P1_1 r^3 + c * r^2 accurately. +// +// Hence, +// +// tan(r+c) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 +// + c*(1 + r^2) +// +// -cot(r+c) = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 +// + Q1_1*c +// +// +// Case normal_r: 2^(-2) <= |r| <= pi/4 +// ------------------------------------ +// +// This case is more likely than the previous one if one considers +// r to be uniformly distributed in [-pi/4, pi/4]. +// +// The required calculation is either +// +// tan(r + c) = tan(r) + correction, or +// -cot(r + c) = -cot(r) + correction. +// +// Specifically, +// +// tan(r + c) = tan(r) + c tan'(r) + O(c^2) +// = tan(r) + c sec^2(r) + O(c^2) +// = tan(r) + c SEC_sq ...accurately +// as long as SEC_sq approximates sec^2(r) +// to, say, 5 bits or so. +// +// Similarly, +// +// -cot(r + c) = -cot(r) - c cot'(r) + O(c^2) +// = -cot(r) + c csc^2(r) + O(c^2) +// = -cot(r) + c CSC_sq ...accurately +// as long as CSC_sq approximates csc^2(r) +// to, say, 5 bits or so. +// +// We therefore concentrate on accurately calculating tan(r) and +// cot(r) for a working-precision number r, |r| <= pi/4 to within +// 0.1% or so. +// +// We will employ a table-driven approach. Let +// +// r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ...
b_63 +// = sgn_r * ( B + x ) +// +// where +// +// B = 2^k * 1.b_1 b_2 ... b_5 1 +// x = |r| - B +// +// Now, +// tan(B) + tan(x) +// tan( B + x ) = ------------------------ +// 1 - tan(B)*tan(x) +// +// / \ +// | tan(B) + tan(x) | + +// = tan(B) + | ------------------------ - tan(B) | +// | 1 - tan(B)*tan(x) | +// \ / +// +// sec^2(B) * tan(x) +// = tan(B) + ------------------------ +// 1 - tan(B)*tan(x) +// +// (1/[sin(B)*cos(B)]) * tan(x) +// = tan(B) + -------------------------------- +// cot(B) - tan(x) +// +// +// Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are +// calculated beforehand and stored in a table. Since +// +// |x| <= 2^k * 2^(-6) <= 2^(-7) (because k = -1, -2) +// +// a very short polynomial will be sufficient to approximate tan(x) +// accurately. The details involved in computing the last expression +// will be given in the next section on algorithm description. +// +// +// Now, we turn to the case where cot( B + x ) is needed. +// +// +// 1 - tan(B)*tan(x) +// cot( B + x ) = ------------------------ +// tan(B) + tan(x) +// +// / \ +// | 1 - tan(B)*tan(x) | + +// = cot(B) + | ----------------------- - cot(B) | +// | tan(B) + tan(x) | +// \ / +// +// [tan(B) + cot(B)] * tan(x) +// = cot(B) - ---------------------------- +// tan(B) + tan(x) +// +// (1/[sin(B)*cos(B)]) * tan(x) +// = cot(B) - -------------------------------- +// tan(B) + tan(x) +// +// +// Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that +// are needed are the same set of values needed in the previous +// case. +// +// Finally, we can put all the ingredients together as follows: +// +// Arg = N * pi/2 + r + c ...accurately +// +// tan(Arg) = tan(r) + correction if N is even; +// = -cot(r) + correction otherwise. 
+// +// For Cases 2 and 4, +// +// Case 2: +// tan(Arg) = tan(r + c) = r + c + r^3/3 N even +// = -cot(r + c) = -1/(r+c) + r/3 N odd +// Case 4: +// tan(Arg) = tan(r + c) = r + c + r^3/3 + 2r^5/15 N even +// = -cot(r + c) = -1/(r+c) + r/3 + r^3/45 N odd +// +// +// For Cases 1 and 3, +// +// Case small_r: |r| < 2^(-2) +// +// tan(Arg) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 +// + c*(1 + r^2) N even +// +// = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 +// + Q1_1*c N odd +// +// Case normal_r: 2^(-2) <= |r| <= pi/4 +// +// tan(Arg) = tan(r) + c * sec^2(r) N even +// = -cot(r) + c * csc^2(r) otherwise +// +// For N even, +// +// tan(Arg) = tan(r) + c*sec^2(r) +// = tan( sgn_r * (B+x) ) + c * sec^2(|r|) +// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) ) +// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) ) +// +// since B approximates |r| to 2^(-6) in relative accuracy. +// +// / (1/[sin(B)*cos(B)]) * tan(x) +// tan(Arg) = sgn_r * | tan(B) + -------------------------------- +// \ cot(B) - tan(x) +// \ +// + CORR | + +// / +// where +// +// CORR = sgn_r*c*tan(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)). +// +// For N odd, +// +// tan(Arg) = -cot(r) + c*csc^2(r) +// = -cot( sgn_r * (B+x) ) + c * csc^2(|r|) +// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) ) +// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) ) +// +// since B approximates |r| to 2^(-6) in relative accuracy. +// +// / (1/[sin(B)*cos(B)]) * tan(x) +// tan(Arg) = sgn_r * | -cot(B) + -------------------------------- +// \ tan(B) + tan(x) +// \ +// + CORR | + +// / +// where +// +// CORR = sgn_r*c*cot(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)). +// +// +// The actual algorithm prescribes how all the mathematical formulas +// are calculated. +// +// +// 2. Algorithmic Description +// ========================== +// +// 2.1 Computation for Cases 2 and 4. +// ---------------------------------- +// +// For Case 2, we use two-term polynomials. 
+// +// For N even, +// +// rsq := r * r +// Result := c + r * rsq * P1_1 +// Result := r + Result ...in user-defined rounding +// +// For N odd, +// S_hi := -frcpa(r) ...8 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits +// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c ) +// ...S_hi + S_lo is -1/(r+c) to extra precision +// S_lo := S_lo + Q1_1*r +// +// Result := S_hi + S_lo ...in user-defined rounding +// +// For Case 4, we use three-term polynomials +// +// For N even, +// +// rsq := r * r +// Result := c + r * rsq * (P1_1 + rsq * P1_2) +// Result := r + Result ...in user-defined rounding +// +// For N odd, +// S_hi := -frcpa(r) ...8 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits +// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c ) +// ...S_hi + S_lo is -1/(r+c) to extra precision +// rsq := r * r +// P := Q1_1 + rsq*Q1_2 +// S_lo := S_lo + r*P +// +// Result := S_hi + S_lo ...in user-defined rounding +// +// +// Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are +// the same as those used in the small_r case of Cases 1 and 3 +// below. +// +// +// 2.2 Computation for Cases 1 and 3. +// ---------------------------------- +// This is further divided into the case of small_r, +// where |r| < 2^(-2), and the case of normal_r, where |r| lies between +// 2^(-2) and pi/4. +// +// Algorithm for the case of small_r +// --------------------------------- +// +// For N even, +// rsq := r * r +// Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3)) +// r_to_the_8 := rsq * rsq +// r_to_the_8 := r_to_the_8 * r_to_the_8 +// Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... 
rsq*P1_9)) +// CORR := c * ( 1 + rsq ) +// Poly := Poly1 + r_to_the_8*Poly2 +// Result := r*Poly + CORR +// Result := r + Result ...in user-defined rounding +// ...note that Poly1 and r_to_the_8 can be computed in parallel +// ...with Poly2 (Poly1 is intentionally set to be much +// ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden) +// +// For N odd, +// S_hi := -frcpa(r) ...8 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits +// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c ) +// ...S_hi + S_lo is -1/(r+c) to extra precision +// S_lo := S_lo + Q1_1*c +// +// ...S_hi and S_lo are computed in parallel with +// ...the following +// rsq := r*r +// P := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7)) +// +// Result := r*P + S_lo +// Result := S_hi + Result ...in user-defined rounding +// +// +// Algorithm for the case of normal_r +// ---------------------------------- +// +// Here, we first consider the computation of tan( r + c ). As +// presented in the previous section, +// +// tan( r + c ) = tan(r) + c * sec^2(r) +// = sgn_r * [ tan(B+x) + CORR ] +// CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)] +// +// because sec^2(r) = sec^2(|r|), and B approximates |r| to 6.5 bits. +// +// tan( r + c ) = +// / (1/[sin(B)*cos(B)]) * tan(x) +// sgn_r * | tan(B) + -------------------------------- + +// \ cot(B) - tan(x) +// \ +// CORR | + +// / +// +// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are +// calculated beforehand and stored in a table. Specifically, +// the table values are +// +// tan(B) as T_hi + T_lo; +// cot(B) as C_hi + C_lo; +// 1/[sin(B)*cos(B)] as SC_inv +// +// T_hi, C_hi are in double-precision memory format; +// T_lo, C_lo are in single-precision memory format; +// SC_inv is in extended-precision memory format.
+// +// The value of tan(x) will be approximated by a short polynomial of +// the form +// +// tan(x) as x + x * P, where +// P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3)) +// +// Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x) +// to a relative accuracy better than 2^(-20). Thus, a good +// initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative +// division is: +// +// 1/(cot(B) - tan(x)) is approximately +// 1/(cot(B) - x) is +// tan(B)/(1 - x*tan(B)) is approximately +// T_hi / ( 1 - T_hi * x ) is approximately +// +// T_hi * [ 1 + (T_hi * x) + (T_hi * x)^2 ] +// +// The calculation of tan(r+c) therefore proceeds as follows: +// +// Tx := T_hi * x +// xsq := x * x +// +// V_hi := T_hi*(1 + Tx*(1 + Tx)) +// P := xsq * (P2_1 + xsq*(P2_2 + xsq*P2_3)) +// ...V_hi serves as an initial guess of 1/(cot(B) - tan(x)) +// ...good to about 20 bits of accuracy +// +// tanx := x + x*P +// D := C_hi - tanx +// ...D is a double precision denominator: cot(B) - tan(x) +// +// V_hi := V_hi + V_hi*(1 - V_hi*D) +// ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits +// +// V_lo := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ] +// - V_hi*C_lo ) ...observe all order +// ...V_hi + V_lo approximates 1/(cot(B) - tan(x)) +// ...to extra accuracy +// +// ... SC_inv(B) * (x + x*P) +// ... tan(B) + ------------------------- + CORR +// ... cot(B) - (x + x*P) +// ... +// ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR +// ... +// +// Sx := SC_inv * x +// CORR := sgn_r * c * SC_inv * T_hi +// +// ...put the ingredients together to compute +// ... SC_inv(B) * (x + x*P) +// ... tan(B) + ------------------------- + CORR +// ... cot(B) - (x + x*P) +// ... +// ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR +// ... +// ... = T_hi + T_lo + CORR + +// ...
Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo) +// +// CORR := CORR + T_lo +// tail := V_lo + P*(V_hi + V_lo) +// tail := Sx * tail + CORR +// tail := Sx * V_hi + tail +// T_hi := sgn_r * T_hi +// +// ...T_hi + sgn_r*tail now approximates +// ...sgn_r*(tan(B+x) + CORR) accurately +// +// Result := T_hi + sgn_r*tail ...in user-defined +// ...rounding control +// ...It is crucial that independent paths be fully +// ...exploited for performance's sake. +// +// +// Next, we consider the computation of -cot( r + c ). As +// presented in the previous section, +// +// -cot( r + c ) = -cot(r) + c * csc^2(r) +// = sgn_r * [ -cot(B+x) + CORR ] +// CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)] +// +// because csc^2(r) = csc^2(|r|), and B approximates |r| to 6.5 bits. +// +// -cot( r + c ) = +// / (1/[sin(B)*cos(B)]) * tan(x) +// sgn_r * | -cot(B) + -------------------------------- + +// \ tan(B) + tan(x) +// \ +// CORR | + +// / +// +// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are +// calculated beforehand and stored in a table. Specifically, +// the table values are +// +// tan(B) as T_hi + T_lo; +// cot(B) as C_hi + C_lo; +// 1/[sin(B)*cos(B)] as SC_inv +// +// T_hi, C_hi are in double-precision memory format; +// T_lo, C_lo are in single-precision memory format; +// SC_inv is in extended-precision memory format. +// +// The value of tan(x) will be approximated by a short polynomial of +// the form +// +// tan(x) as x + x * P, where +// P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3)) +// +// Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x) +// to a relative accuracy better than 2^(-18).
Thus, a good +// initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative +// division is: +// +// 1/(tan(B) + tan(x)) is approximately +// 1/(tan(B) + x) is +// cot(B)/(1 + x*cot(B)) is approximately +// C_hi / ( 1 + C_hi * x ) is approximately +// +// C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ] +// +// The calculation of -cot(r+c) therefore proceeds as follows: +// +// Cx := C_hi * x +// xsq := x * x +// +// V_hi := C_hi*(1 - Cx*(1 - Cx)) +// P := xsq * (P2_1 + xsq*(P2_2 + xsq*P2_3)) +// ...V_hi serves as an initial guess of 1/(tan(B) + tan(x)) +// ...good to about 18 bits of accuracy +// +// tanx := x + x*P +// D := T_hi + tanx +// ...D is a double precision denominator: tan(B) + tan(x) +// +// V_hi := V_hi + V_hi*(1 - V_hi*D) +// ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits +// +// V_lo := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ] +// - V_hi*T_lo ) ...observe all order +// ...V_hi + V_lo approximates 1/(tan(B) + tan(x)) +// ...to extra accuracy +// +// ... SC_inv(B) * (x + x*P) +// ... -cot(B) + ------------------------- + CORR +// ... tan(B) + (x + x*P) +// ... +// ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR +// ... +// +// Sx := SC_inv * x +// CORR := sgn_r * c * SC_inv * C_hi +// +// ...put the ingredients together to compute +// ... SC_inv(B) * (x + x*P) +// ... -cot(B) + ------------------------- + CORR +// ... tan(B) + (x + x*P) +// ... +// ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR +// ... +// ... =-C_hi - C_lo + CORR + +// ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo) +// +// CORR := CORR - C_lo +// tail := V_lo + P*(V_hi + V_lo) +// tail := Sx * tail + CORR +// tail := Sx * V_hi + tail +// C_hi := -sgn_r * C_hi +// +// ...C_hi + sgn_r*tail now approximates +// ...sgn_r*(-cot(B+x) + CORR) accurately +// +// Result := C_hi + sgn_r*tail in user-defined rounding control +// ...It is crucial that independent paths be fully +// ...exploited for performance's sake. +// +// 3.
Implementation Notes +// ======================= +// +// Table entries T_hi, T_lo; C_hi, C_lo; SC_inv +// +// Recall that 2^(-2) <= |r| <= pi/4; +// +// r = sgn_r * 2^k * 1.b_1 b_2 ... b_63 +// +// and +// +// B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1 +// +// Thus, for k = -2, possible values of B are +// +// B = 2^(-2) * ( 1 + index/32 + 1/64 ), +// index ranges from 0 to 31 +// +// For k = -1, however, since |r| <= pi/4 = 0.78... +// possible values of B are +// +// B = 2^(-1) * ( 1 + index/32 + 1/64 ) +// index ranges from 0 to 19. +// +// + +#include "libm_support.h" + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 128 + +TAN_BASE_CONSTANTS: +ASM_TYPE_DIRECTIVE(TAN_BASE_CONSTANTS,@object) +data4 0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24 + // two**-14, -two**-14 +data4 0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi +data4 0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0 +data4 0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1 +data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2 +data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3 +data4 0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63 +data4 0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0 +data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1 +data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2 +data4 0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4 +data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4 +data4 0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2 +data4 0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33 +data4 0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1 +data4 0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2 +data4 0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3 +data4 0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4 +data4 0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5 +data4 0x61C67914, 
0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6 +data4 0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7 +data4 0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8 +data4 0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9 +data4 0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1 +data4 0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2 +data4 0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3 +data4 0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4 +data4 0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5 +data4 0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6 +data4 0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7 +data4 0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1 +data4 0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2 +data4 0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3 +// +// Entries T_hi double-precision memory format +// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) +// Entries T_lo single-precision memory format +// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) +// +data4 0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000 +data4 0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000 +data4 0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000 +data4 0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000 +data4 0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000 +data4 0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000 +data4 0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000 +data4 0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000 +data4 0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000 +data4 0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000 +data4 0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000 +data4 0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000 +data4 0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000 +data4 0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000 +data4 0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000 +data4 0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000 +data4 0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000 +data4 0xEB9DE4DE, 
0x3FDA0FF6, 0x22859FA7, 0x00000000 +data4 0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000 +data4 0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000 +data4 0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000 +data4 0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000 +data4 0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000 +data4 0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000 +data4 0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000 +data4 0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000 +data4 0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000 +data4 0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000 +data4 0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000 +data4 0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000 +data4 0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000 +data4 0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000 +// +// Entries T_hi double-precision memory format +// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) +// Entries T_lo single-precision memory format +// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) +// +data4 0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000 +data4 0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000 +data4 0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000 +data4 0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000 +data4 0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000 +data4 0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000 +data4 0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000 +data4 0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000 +data4 0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000 +data4 0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000 +data4 0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000 +data4 0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000 +data4 0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000 +data4 0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000 +data4 0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000 +data4 0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000 +data4 0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000 +data4 0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000 +data4 0xF7E06E4B, 
0x3FF01E20, 0x2480D3EE, 0x00000000 +data4 0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000 +// +// Entries C_hi double-precision memory format +// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) +// Entries C_lo single-precision memory format +// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) +// +data4 0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000 +data4 0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000 +data4 0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000 +data4 0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000 +data4 0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000 +data4 0x5285B068, 0x400A855A, 0x25524F18, 0x00000000 +data4 0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000 +data4 0x646AF623, 0x4009188A, 0x254FD801, 0x00000000 +data4 0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000 +data4 0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000 +data4 0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000 +data4 0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000 +data4 0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000 +data4 0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000 +data4 0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000 +data4 0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000 +data4 0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000 +data4 0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000 +data4 0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000 +data4 0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000 +data4 0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000 +data4 0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000 +data4 0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000 +data4 0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000 +data4 0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000 +data4 0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000 +data4 0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000 +data4 0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000 +data4 0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000 +data4 0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000 +data4 0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000 +data4 0x54F7E60A, 
0x3FFD8FCA, 0x248D3058, 0x00000000 +// +// Entries C_hi double-precision memory format +// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) +// Entries C_lo single-precision memory format +// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) +// +data4 0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000 +data4 0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000 +data4 0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000 +data4 0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000 +data4 0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000 +data4 0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000 +data4 0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000 +data4 0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000 +data4 0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000 +data4 0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000 +data4 0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000 +data4 0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000 +data4 0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000 +data4 0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000 +data4 0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000 +data4 0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000 +data4 0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000 +data4 0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000 +data4 0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000 +data4 0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000 +// +// Entries SC_inv in Swapped IEEE format (extended) +// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) +// +data4 0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000 +data4 0x554B0EB0, 0x80092804, 0x00004001, 0x00000000 +data4 0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000 +data4 0x77378677, 0xF3086BA0, 0x00004000, 0x00000000 +data4 0xCCD4723C, 0xED154515, 0x00004000, 0x00000000 +data4 0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000 +data4 0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000 +data4 0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000 +data4 0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000 +data4 0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000 +data4 0x283BEBD5, 
0xCFAF16B1, 0x00004000, 0x00000000 +data4 0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000 +data4 0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000 +data4 0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000 +data4 0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000 +data4 0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000 +data4 0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000 +data4 0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000 +data4 0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000 +data4 0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000 +data4 0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000 +data4 0x7221235F, 0xAC862A23, 0x00004000, 0x00000000 +data4 0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000 +data4 0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000 +data4 0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000 +data4 0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000 +data4 0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000 +data4 0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000 +data4 0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000 +data4 0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000 +data4 0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000 +data4 0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000 +// +// Entries SC_inv in Swapped IEEE format (extended) +// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) +// +data4 0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000 +data4 0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000 +data4 0x61B798AF, 0x9147094F, 0x00004000, 0x00000000 +data4 0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000 +data4 0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000 +data4 0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000 +data4 0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000 +data4 0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000 +data4 0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000 +data4 0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000 +data4 0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000 +data4 0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000 +data4 0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000 +data4 
0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000 +data4 0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000 +data4 0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000 +data4 0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000 +data4 0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000 +data4 0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000 +data4 0xA07791FA, 0x80186650, 0x00004000, 0x00000000 + +/* Symbolic names for the floating-point registers used by __libm_tan. Arg and Result both name f8. */ +Arg = f8 +Result = f8 +fp_tmp = f9 +U_2 = f10 +rsq = f11 +C_hi = f12 +C_lo = f13 +T_hi = f14 +T_lo = f15 + +N_0 = f32 +d_1 = f33 +MPI_BY_4 = f34 +tail = f35 +tanx = f36 +Cx = f37 +Sx = f38 +sgn_r = f39 +CORR = f40 +P = f41 +D = f42 +ArgPrime = f43 +P_0 = f44 + +/* P2_1..P2_3 and P1_1..P1_3 below name the same registers f45..f47. */ +P2_1 = f45 +P2_2 = f46 +P2_3 = f47 + +P1_1 = f45 +P1_2 = f46 +P1_3 = f47 + +P1_4 = f48 +P1_5 = f49 +P1_6 = f50 +P1_7 = f51 +P1_8 = f52 +P1_9 = f53 + +TWO_TO_63 = f54 +NEGTWO_TO_63 = f55 +x = f56 +xsq = f57 +Tx = f58 +Tx1 = f59 +Set = f60 +poly1 = f61 +poly2 = f62 +Poly = f63 +Poly1 = f64 +Poly2 = f65 +r_to_the_8 = f66 +B = f67 +SC_inv = f68 +Pos_r = f69 +N_0_fix = f70 +PI_BY_4 = f71 +NEGTWO_TO_NEG2 = f72 +TWO_TO_24 = f73 +TWO_TO_NEG14 = f74 +TWO_TO_NEG33 = f75 +/* NOTE(review): NEGTWO_TO_24 and NEGTWO_TO_NEG14 both name f76. The visible code loads NEGTWO_TO_NEG14 only under TAN_LARGER_ARG, after the last visible read of NEGTWO_TO_24 -- confirm the sharing is intentional. */ +NEGTWO_TO_24 = f76 +NEGTWO_TO_NEG14 = f76 +NEGTWO_TO_NEG33 = f77 +two_by_PI = f78 +N = f79 +N_fix = f80 +P_1 = f81 +P_2 = f82 +P_3 = f83 +s_val = f84 +w = f85 +c = f86 +r = f87 +/* NOTE(review): Z is read by "fmpy.s1 rsq = r, Z" but no write to it is visible in this part of the file -- confirm it is initialised. */ +Z = f88 +A = f89 +a = f90 +t = f91 +U_1 = f92 +d_2 = f93 +TWO_TO_NEG2 = f94 +Q1_1 = f95 +Q1_2 = f96 +Q1_3 = f97 +Q1_4 = f98 +Q1_5 = f99 +Q1_6 = f100 +Q1_7 = f101 +Q1_8 = f102 +S_hi = f103 +S_lo = f104 +V_hi = f105 +V_lo = f106 +U_hi = f107 +U_lo = f108 +U_hiabs = f109 +V_hiabs = f110 +V = f111 +Inv_P_0 = f112 + +/* Symbolic names for the general registers used by __libm_tan. */ +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 +GR_SAVE_PFS = r35 + +delta1 = r36 +table_ptr1 = r37 +table_ptr2 = r38 +i_0 = r39 +i_1 = r40 +N_fix_gr = r41 +N_inc = r42 +exp_Arg = r43 +exp_r = r44 +sig_r = r45 +lookup = r46 +table_offset = r47 +Create_B = r48 +/* gr_tmp and GR_Parameter_X below both name r49. */ +gr_tmp = r49 + +GR_Parameter_X = r49 +GR_Parameter_r = r50 + + + +.global __libm_tan +.section .text +.proc __libm_tan + + +__libm_tan: + +{ .mfi +alloc r32 
= ar.pfs, 0,17,2,0 +(p0) fclass.m.unc p6,p0 = Arg, 0x1E7 + addl gr_tmp = -1,r0 +} +;; + +{ .mfi + nop.m 999 +(p0) fclass.nm.unc p7,p0 = Arg, 0x1FF + nop.i 999 +} +;; + +{ .mfi +(p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp + nop.f 999 + nop.i 999 +} +;; + +{ .mmi + ld8 table_ptr1 = [table_ptr1] + setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact + nop.i 999 +} +;; + +// +// Check for NatVals, Infs , NaNs, and Zeros +// Check for everything - if false, then must be pseudo-zero +// or pseudo-nan. +// Local table pointer +// + +{ .mbb +(p0) add table_ptr2 = 96, table_ptr1 +(p6) br.cond.spnt __libm_TAN_SPECIAL +(p7) br.cond.spnt __libm_TAN_SPECIAL ;; +} +// +// Point to Inv_P_0 +// Branch out to deal with unsupporteds and special values. +// + +{ .mmf +(p0) ldfs TWO_TO_24 = [table_ptr1],4 +(p0) ldfs TWO_TO_63 = [table_ptr2],4 +// +// Load -2**24, load -2**63. +// +(p0) fcmp.eq.s0 p0, p6 = Arg, f1 ;; +} + +{ .mfi +(p0) ldfs NEGTWO_TO_63 = [table_ptr2],12 +(p0) fnorm.s1 Arg = Arg + nop.i 999 +} +// +// Load 2**24, Load 2**63. +// + +{ .mmi +(p0) ldfs NEGTWO_TO_24 = [table_ptr1],12 ;; +// +// Do fcmp to generate Denormal exception +// - can't do FNORM (will generate Underflow when U is unmasked!) +// Normalize input argument. +// +(p0) ldfe two_by_PI = [table_ptr1],16 + nop.i 999 +} + +{ .mmi +(p0) ldfe Inv_P_0 = [table_ptr2],16 ;; +(p0) ldfe d_1 = [table_ptr2],16 + nop.i 999 +} +// +// Decide about the paths to take: +// PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2 +// OTHERWISE - CASE 3 OR 4 +// Load inverse of P_0 . +// Set PR_6 if Arg <= -2**63 +// Are there any Infs, NaNs, or zeros? 
+// + +{ .mmi +(p0) ldfe P_0 = [table_ptr1],16 ;; +(p0) ldfe d_2 = [table_ptr2],16 + nop.i 999 +} +// +// Set PR_8 if Arg <= -2**24 +// Set PR_6 if Arg >= 2**63 +// + +{ .mmi +(p0) ldfe P_1 = [table_ptr1],16 ;; +(p0) ldfe PI_BY_4 = [table_ptr2],16 + nop.i 999 +} +// +// Set PR_8 if Arg >= 2**24 +// + +{ .mmi +(p0) ldfe P_2 = [table_ptr1],16 ;; +(p0) ldfe MPI_BY_4 = [table_ptr2],16 + nop.i 999 +} +// +// Load P_2 and PI_BY_4 +// + +{ .mfi +(p0) ldfe P_3 = [table_ptr1],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p9) fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +// +// Load P_3 and -PI_BY_4 +// +(p6) br.cond.spnt TAN_ARG_TOO_LARGE ;; +} + +{ .mib + nop.m 999 + nop.i 999 +// +// Load 2**(-2). +// Load -2**(-2). +// Branch out if we have a special argument. +// Branch out if the magnitude of the input argument is too large +// - do this branch before the next. +// +(p8) br.cond.spnt TAN_LARGER_ARG ;; +} +// +// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24 +// + +{ .mfi +(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4 +// ARGUMENT REDUCTION CODE - CASE 1 and 2 +// Load 2**(-2). +// Load -2**(-2). +(p0) fmpy.s1 N = Arg,two_by_PI + nop.i 999 ;; +} + +{ .mfi +(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],12 +// +// N = Arg * 2/pi +// +(p0) fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if Arg < pi/4, set PR_8. +// +(p8) fcmp.gt.s1 p8,p9= Arg,MPI_BY_4 + nop.i 999 ;; +} +// +// Case 1: Is |r| < 2**(-2). +// Arg is the same as r in this case. +// r = Arg +// c = 0 +// + +{ .mfi +(p8) mov N_fix_gr = r0 +// +// if Arg > -pi/4, reset PR_8. +// Select the case when |Arg| < pi/4 - set PR[8] = true. 
+// Else Select the case when |Arg| >= pi/4 - set PR[9] = true. +// +(p0) fcvt.fx.s1 N_fix = N + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Grab the integer part of N . +// +(p8) mov r = Arg + nop.i 999 +} + +{ .mfi + nop.m 999 +(p8) mov c = f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 2: Place integer part of N in GP register. +// +(p9) fcvt.xf N = N_fix + nop.i 999 ;; +} + +{ .mib +(p9) getf.sig N_fix_gr = N_fix + nop.i 999 +// +// Case 2: Convert integer N_fix back to normalized floating-point value. +// +(p10) br.cond.spnt TAN_SMALL_R ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p8) br.cond.sptk TAN_NORMAL_R ;; +} +// +// Case 1: PR_3 is only affected when PR_1 is set. +// + +{ .mmi +(p9) ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;; +// +// Case 2: Load 2**(-33). +// +(p9) ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 2: Load -2**(-33). +// +(p9) fnma.s1 s_val = N, P_1, Arg + nop.i 999 +} + +{ .mfi + nop.m 999 +(p9) fmpy.s1 w = N, P_2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 2: w = N * P_2 +// Case 2: s_val = -N * P_1 + Arg +// +(p0) fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Decide between case_1 and case_2 reduce: +// +(p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 1_reduce: s <= -2**(-33) or s >= 2**(-33) +// Case 2_reduce: -2**(-33) < s < 2**(-33) +// +(p8) fsub.s1 r = s_val, w + nop.i 999 +} + +{ .mfi + nop.m 999 +(p9) fmpy.s1 w = N, P_3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fma.s1 U_1 = N, P_2, w + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Case 1_reduce: Is |r| < 2**(-2), if so set PR_10 +// else set PR_11. 
+// +(p8) fsub.s1 c = s_val, r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 1_reduce: r = s + w (change sign) +// Case 2_reduce: w = N * P_3 (change sign) +// +(p8) fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fsub.s1 r = s_val, U_1 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Case 1_reduce: c is complete here. +// c = c + w (w has not been negated.) +// Case 2_reduce: r is complete here - continue to calculate c . +// r = s - U_1 +// +(p9) fms.s1 U_2 = N, P_2, U_1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 1_reduce: c = s - r +// Case 2_reduce: U_1 = N * P_2 + w +// +(p8) fsub.s1 c = c, w + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fsub.s1 s_val = s_val, r + nop.i 999 +} + +{ .mfb + nop.m 999 +// +// Case 2_reduce: +// U_2 = N * P_2 - U_1 +// Not needed until later. +// +(p9) fadd.s1 U_2 = U_2, w +// +// Case 2_reduce: +// s = s - r +// U_2 = U_2 + w +// +(p10) br.cond.spnt TAN_SMALL_R ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p11) br.cond.sptk TAN_NORMAL_R ;; +} + +{ .mii + nop.m 999 +// +// Case 2_reduce: +// c = c - U_2 +// c is complete here +// Argument reduction ends here. +// +(p9) extr.u i_1 = N_fix_gr, 0, 1 ;; +(p9) cmp.eq.unc p11, p12 = 0x0000,i_1 ;; +} + +{ .mfi + nop.m 999 +// +// Is i_1 even or odd? +// if i_1 == 0, set p11, else set p12. +// +(p11) fmpy.s1 rsq = r, Z + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) frcpa.s1 S_hi,p0 = f1, r + nop.i 999 +} + +// +// Case 1: Branch to SMALL_R or NORMAL_R. +// Case 1 is done now. +// + +{ .mfi +(p9) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp +(p9) fsub.s1 c = s_val, U_1 + nop.i 999 ;; +} +;; + +{ .mmi +(p9) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + +{ .mmi +(p9) add table_ptr1 = 224, table_ptr1 ;; +(p9) ldfe P1_1 = [table_ptr1],144 + nop.i 999 ;; +} +// +// Get [i_1] - lsb of N_fix_gr . +// Load P1_1 and point to Q1_1 . 
+// + +{ .mfi +(p9) ldfe Q1_1 = [table_ptr1] , 0 +// +// N even: rsq = r * Z +// N odd: S_hi = frcpa(r) +// +(p12) fmerge.ns S_hi = S_hi, S_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Case 2_reduce: +// c = s - U_1 +// +(p9) fsub.s1 c = c, U_2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: Change sign of S_hi +// +(p11) fmpy.s1 rsq = rsq, P1_1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: rsq = rsq * P1_1 +// N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary +// +(p11) fma.s1 Result = r, rsq, c + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Result = c + r * rsq +// N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary +// +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Result = Result + r +// N odd: poly1 = 1.0 + S_hi * r 32 bits partial +// +(p11) fadd.s0 Result = r, Result + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Result1 = Result + r +// N odd: S_hi = S_hi * poly1 + S_hi 32 bits +// +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * r + 1.0 64 bits partial +// +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * poly + 1.0 64 bits +// +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * r + 1.0 +// +(p12) fma.s1 poly1 = S_hi, c, poly1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * c + poly1 +// +(p12) fmpy.s1 S_lo = S_hi, poly1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: S_lo = S_hi * poly1 +// +(p12) fma.s1 S_lo = Q1_1, r, S_lo + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// N odd: Result = S_hi + S_lo +// +(p0) fmpy.s0 fp_tmp = 
fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// N odd: S_lo = S_lo + Q1_1 * r +// +(p12) fadd.s0 Result = S_hi, S_lo +(p0) br.ret.sptk b0 ;; +} + + +TAN_LARGER_ARG: + +{ .mmf +(p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp + nop.m 999 +(p0) fmpy.s1 N_0 = Arg, Inv_P_0 +} +;; + +// +// ARGUMENT REDUCTION CODE - CASE 3 and 4 +// +// +// Adjust table_ptr1 to beginning of table. +// N_0 = Arg * Inv_P_0 +// + + +{ .mmi +(p0) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + + +{ .mmi +(p0) add table_ptr1 = 8, table_ptr1 ;; +// +// Point to 2*-14 +// +(p0) ldfs TWO_TO_NEG14 = [table_ptr1], 4 + nop.i 999 ;; +} +// +// Load 2**(-14). +// + +{ .mmi +(p0) ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;; +// +// N_0_fix = integer part of N_0 . +// Adjust table_ptr1 to beginning of table. +// +(p0) ldfs TWO_TO_NEG2 = [table_ptr1], 4 + nop.i 999 ;; +} +// +// Make N_0 the integer part. +// + +{ .mfi +(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr1] +// +// Load -2**(-14). +// +(p0) fcvt.fx.s1 N_0_fix = N_0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcvt.xf N_0 = N_0_fix + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 ArgPrime = N_0, P_0, Arg + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 w = N_0, d_1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// ArgPrime = -N_0 * P_0 + Arg +// w = N_0 * d_1 +// +(p0) fmpy.s1 N = ArgPrime, two_by_PI + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N = ArgPrime * 2/pi +// +(p0) fcvt.fx.s1 N_fix = N + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N_fix is the integer part. +// +(p0) fcvt.xf N = N_fix + nop.i 999 ;; +} + +{ .mfi +(p0) getf.sig N_fix_gr = N_fix + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N is the integer part of the reduced-reduced argument. +// Put the integer in a GP register. 
+// +(p0) fnma.s1 s_val = N, P_1, ArgPrime + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fnma.s1 w = N, P_2, w + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// s_val = -N*P_1 + ArgPrime +// w = -N*P_2 + w +// +(p0) fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 3: r = s_val + w (Z complete) +// Case 4: U_hi = N_0 * d_1 +// +(p10) fmpy.s1 V_hi = N, P_2 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p11) fmpy.s1 U_hi = N_0, d_1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 3: r = s_val + w (Z complete) +// Case 4: U_hi = N_0 * d_1 +// +(p11) fmpy.s1 V_hi = N, P_2 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p11) fmpy.s1 U_hi = N_0, d_1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Decide between case 3 and 4: +// Case 3: s <= -2**(-14) or s >= 2**(-14) +// Case 4: -2**(-14) < s < 2**(-14) +// +(p10) fadd.s1 r = s_val, w + nop.i 999 +} + +{ .mfi + nop.m 999 +(p11) fmpy.s1 w = N, P_3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 4: We need abs of both U_hi and V_hi - dont +// worry about switched sign of V_hi . +// +(p11) fsub.s1 A = U_hi, V_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Case 4: A = U_hi + V_hi +// Note: Worry about switched sign of V_hi, so subtract instead of add. +// +(p11) fnma.s1 V_lo = N, P_2, V_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fms.s1 U_lo = N_0, d_1, U_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fabs V_hiabs = V_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Case 4: V_hi = N * P_2 +// w = N * P_3 +// Note the product does not include the (-) as in the writeup +// so (-) missing for V_hi and w . 
+(p10) fadd.s1 r = s_val, w + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 3: c = s_val - r +// Case 4: U_lo = N_0 * d_1 - U_hi +// +(p11) fabs U_hiabs = U_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +(p11) fmpy.s1 w = N, P_3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 4: Set P_12 if U_hiabs >= V_hiabs +// +(p11) fadd.s1 C_hi = s_val, A + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 4: C_hi = s_val + A +// +(p11) fadd.s1 t = U_lo, V_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 3: Is |r| < 2**(-2), if so set PR_7 +// else set PR_8. +// Case 3: If PR_7 is set, prepare to branch to Small_R. +// Case 3: If PR_8 is set, prepare to branch to Normal_R. +// +(p10) fsub.s1 c = s_val, r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 3: c = (s - r) + w (c complete) +// +(p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs + nop.i 999 +} + +{ .mfi + nop.m 999 +(p11) fms.s1 w = N_0, d_2, w + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 4: V_hi = N * P_2 +// w = N * P_3 +// Note the product does not include the (-) as in the writeup +// so (-) missing for V_hi and w . +// +(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup) +// Note: the (-) is still missing for V_hi . +// Case 4: w = w + N_0 * d_2 +// Note: the (-) is now incorporated in w . +// +(p10) fadd.s1 c = c, w +// +// Case 4: t = U_lo + V_lo +// Note: remember V_lo should be (-), subtract instead of add. NO +// +(p14) br.cond.spnt TAN_SMALL_R ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p15) br.cond.spnt TAN_NORMAL_R ;; +} + +{ .mfi + nop.m 999 +// +// Case 3: Vector off when |r| < 2**(-2). Recall that PR_3 will be true. +// The remaining stuff is for Case 4. 
+// +(p12) fsub.s1 a = U_hi, A +(p11) extr.u i_1 = N_fix_gr, 0, 1 ;; +} + +{ .mfi + nop.m 999 +// +// Case 4: C_lo = s_val - C_hi +// +(p11) fadd.s1 t = t, w + nop.i 999 +} + +{ .mfi + nop.m 999 +(p13) fadd.s1 a = V_hi, A + nop.i 999 ;; +} + +// +// Case 4: a = U_hi - A +// a = V_hi - A (do an add to account for missing (-) on V_hi +// + +{ .mfi +(p11) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp +(p11) fsub.s1 C_lo = s_val, C_hi + nop.i 999 +} +;; + +{ .mmi +(p11) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + +// +// Case 4: a = (U_hi - A) + V_hi +// a = (V_hi - A) + U_hi +// In each case account for negative missing form V_hi . +// +// +// Case 4: C_lo = (s_val - C_hi) + A +// + +{ .mmi +(p11) add table_ptr1 = 224, table_ptr1 ;; +(p11) ldfe P1_1 = [table_ptr1], 16 + nop.i 999 ;; +} + +{ .mfi +(p11) ldfe P1_2 = [table_ptr1], 128 +// +// Case 4: w = U_lo + V_lo + w +// +(p12) fsub.s1 a = a, V_hi + nop.i 999 ;; +} +// +// Case 4: r = C_hi + C_lo +// + +{ .mfi +(p11) ldfe Q1_1 = [table_ptr1], 16 +(p11) fadd.s1 C_lo = C_lo, A + nop.i 999 ;; +} +// +// Case 4: c = C_hi - r +// Get [i_1] - lsb of N_fix_gr. +// + +{ .mfi +(p11) ldfe Q1_2 = [table_ptr1], 16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p13) fsub.s1 a = U_hi, a + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fadd.s1 t = t, a + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 4: t = t + a +// +(p11) fadd.s1 C_lo = C_lo, t + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Case 4: C_lo = C_lo + t +// +(p11) fadd.s1 r = C_hi, C_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fsub.s1 c = C_hi, r + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Case 4: c = c + C_lo finished. +// Is i_1 even or odd? +// if i_1 == 0, set PR_4, else set PR_5. +// +// r and c have been computed. +// We known whether this is the sine or cosine routine. 
+// Make sure ftz mode is set - should be automatic when using wre +(p0) fmpy.s1 rsq = r, r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fadd.s1 c = c , C_lo +(p11) cmp.eq.unc p11, p12 = 0x0000, i_1 ;; +} + +{ .mfi + nop.m 999 +(p12) frcpa.s1 S_hi, p0 = f1, r + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// N odd: Change sign of S_hi +// +(p11) fma.s1 Result = rsq, P1_2, P1_1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 P = rsq, Q1_2, Q1_1 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// N odd: Result = S_hi + S_lo (User supplied rounding mode for C1) +// +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: rsq = r * r +// N odd: S_hi = frcpa(r) +// +(p12) fmerge.ns S_hi = S_hi, S_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// N even: rsq = rsq * P1_2 + P1_1 +// N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary +// +(p11) fmpy.s1 Result = rsq, Result + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r,f1 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// N even: Result = Result * rsq +// N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary +// +(p11) fma.s1 Result = r, Result, c + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// N odd: S_hi = S_hi * poly1 + S_hi 32 bits +// +(p11) fadd.s0 Result= r, Result + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Result = Result * r + c +// N odd: poly1 = 1.0 + S_hi * r 32 bits partial +// +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Result1 = Result + r (Rounding mode S0) +// N odd: poly1 = S_hi * r + 1.0 64 bits partial +// +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * poly + 
S_hi 64 bits +// +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * r + 1.0 +// +(p12) fma.s1 poly1 = S_hi, c, poly1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * c + poly1 +// +(p12) fmpy.s1 S_lo = S_hi, poly1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: S_lo = S_hi * poly1 +// +(p12) fma.s1 S_lo = P, r, S_lo + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// N odd: S_lo = S_lo + r * P +// +(p12) fadd.s0 Result = S_hi, S_lo +(p0) br.ret.sptk b0 ;; +} + + +TAN_SMALL_R: + +{ .mii + nop.m 999 +(p0) extr.u i_1 = N_fix_gr, 0, 1 ;; +(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 rsq = r, r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) frcpa.s1 S_hi, p0 = f1, r + nop.i 999 +} + +{ .mfi +(p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp + nop.f 999 + nop.i 999 +} +;; + +{ .mmi +(p0) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + +// ***************************************************************** +// ***************************************************************** +// ***************************************************************** + +{ .mmi +(p0) add table_ptr1 = 224, table_ptr1 ;; +(p0) ldfe P1_1 = [table_ptr1], 16 + nop.i 999 ;; +} +// r and c have been computed. +// We known whether this is the sine or cosine routine. +// Make sure ftz mode is set - should be automatic when using wre +// |r| < 2**(-2) + +{ .mfi +(p0) ldfe P1_2 = [table_ptr1], 16 +(p11) fmpy.s1 r_to_the_8 = rsq, rsq + nop.i 999 ;; +} +// +// Set table_ptr1 to beginning of constant table. +// Get [i_1] - lsb of N_fix_gr. +// + +{ .mfi +(p0) ldfe P1_3 = [table_ptr1], 96 +// +// N even: rsq = r * r +// N odd: S_hi = frcpa(r) +// +(p12) fmerge.ns S_hi = S_hi, S_hi + nop.i 999 ;; +} +// +// Is i_1 even or odd? +// if i_1 == 0, set PR_11. +// if i_1 != 0, set PR_12. 
+// + +{ .mfi +(p11) ldfe P1_9 = [table_ptr1], -16 +// +// N even: Poly2 = P1_7 + Poly2 * rsq +// N odd: poly2 = Q1_5 + poly2 * rsq +// +(p11) fadd.s1 CORR = rsq, f1 + nop.i 999 ;; +} + +{ .mmi +(p11) ldfe P1_8 = [table_ptr1], -16 ;; +// +// N even: Poly1 = P1_2 + P1_3 * rsq +// N odd: poly1 = 1.0 + S_hi * r +// 16 bits partial account for necessary (-1) +// +(p11) ldfe P1_7 = [table_ptr1], -16 + nop.i 999 ;; +} +// +// N even: Poly1 = P1_1 + Poly1 * rsq +// N odd: S_hi = S_hi + S_hi * poly1) 16 bits account for necessary +// + +{ .mfi +(p11) ldfe P1_6 = [table_ptr1], -16 +// +// N even: Poly2 = P1_5 + Poly2 * rsq +// N odd: poly2 = Q1_3 + poly2 * rsq +// +(p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8 + nop.i 999 ;; +} +// +// N even: Poly1 = Poly1 * rsq +// N odd: poly1 = 1.0 + S_hi * r 32 bits partial +// + +{ .mfi +(p11) ldfe P1_5 = [table_ptr1], -16 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} +// +// N even: CORR = CORR * c +// N odd: S_hi = S_hi * poly1 + S_hi 32 bits +// + +// +// N even: Poly2 = P1_6 + Poly2 * rsq +// N odd: poly2 = Q1_4 + poly2 * rsq +// +{ .mmf +(p0) addl table_ptr2 = @ltoff(TAN_BASE_CONSTANTS), gp +(p11) ldfe P1_4 = [table_ptr1], -16 +(p11) fmpy.s1 CORR = CORR, c +} +;; + + +{ .mmi +(p0) ld8 table_ptr2 = [table_ptr2] + nop.m 999 + nop.i 999 +} +;; + + +{ .mii +(p0) add table_ptr2 = 464, table_ptr2 + nop.i 999 ;; + nop.i 999 +} + +{ .mfi + nop.m 999 +(p11) fma.s1 Poly1 = P1_3, rsq, P1_2 + nop.i 999 ;; +} + +{ .mfi +(p0) ldfe Q1_7 = [table_ptr2], -16 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} + +{ .mfi +(p0) ldfe Q1_6 = [table_ptr2], -16 +(p11) fma.s1 Poly2 = P1_9, rsq, P1_8 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfe Q1_5 = [table_ptr2], -16 ;; +(p12) ldfe Q1_4 = [table_ptr2], -16 + nop.i 999 ;; +} + +{ .mfi +(p12) ldfe Q1_3 = [table_ptr2], -16 +// +// N even: Poly2 = P1_8 + P1_9 * rsq +// N odd: poly2 = Q1_6 + Q1_7 * rsq +// +(p11) fma.s1 Poly1 = Poly1, rsq, P1_1 + nop.i 999 ;; +} + +{ .mfi +(p12) ldfe Q1_2 = 
[table_ptr2], -16 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} + +{ .mfi +(p12) ldfe Q1_1 = [table_ptr2], -16 +(p11) fma.s1 Poly2 = Poly2, rsq, P1_7 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: CORR = rsq + 1 +// N even: r_to_the_8 = rsq * rsq +// +(p11) fmpy.s1 Poly1 = Poly1, rsq + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly2 = Q1_7, rsq, Q1_6 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fma.s1 Poly2 = Poly2, rsq, P1_6 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly2 = poly2, rsq, Q1_5 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fma.s1 Poly2= Poly2, rsq, P1_5 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly2 = poly2, rsq, Q1_4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: r_to_the_8 = r_to_the_8 * r_to_the_8 +// N odd: poly1 = S_hi * r + 1.0 64 bits partial +// +(p11) fma.s1 Poly2 = Poly2, rsq, P1_4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Result = CORR + Poly * r +// N odd: P = Q1_1 + poly2 * rsq +// +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly2 = poly2, rsq, Q1_3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Poly2 = P1_4 + Poly2 * rsq +// N odd: poly2 = Q1_2 + poly2 * rsq +// +(p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, c, poly1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fma.s1 poly2 = poly2, rsq, Q1_2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Poly = Poly1 + Poly2 * r_to_the_8 +// N odd: S_hi = S_hi * poly1 + S_hi 64 bits +// +(p11) fma.s1 Result = Poly, r, CORR + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Result = r + Result (User supplied rounding mode) +// N odd: poly1 = S_hi * c + poly1 +// +(p12) fmpy.s1 S_lo = 
S_hi, poly1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fma.s1 P = poly2, rsq, Q1_1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * r + 1.0 +// +(p11) fadd.s0 Result = Result, r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: S_lo = S_hi * poly1 +// +(p12) fma.s1 S_lo = Q1_1, c, S_lo + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// N odd: Result = Result + S_hi (user supplied rounding mode) +// +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: S_lo = Q1_1 * c + S_lo +// +(p12) fma.s1 Result = P, r, S_lo + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// N odd: Result = S_lo + r * P +// +(p12) fadd.s0 Result = Result, S_hi +(p0) br.ret.sptk b0 ;; +} + + +TAN_NORMAL_R: + +{ .mfi +(p0) getf.sig sig_r = r +// ******************************************************************* +// ******************************************************************* +// ******************************************************************* +// +// r and c have been computed. +// Make sure ftz mode is set - should be automatic when using wre +// +// +// Get [i_1] - lsb of N_fix_gr alone. +// +(p0) fmerge.s Pos_r = f1, r +(p0) extr.u i_1 = N_fix_gr, 0, 1 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s sgn_r = r, f1 +(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 ;; +} + +{ .mfi + nop.m 999 + nop.f 999 +(p0) extr.u lookup = sig_r, 58, 5 +} + +{ .mlx + nop.m 999 +(p0) movl Create_B = 0x8200000000000000 ;; +} + +{ .mfi +(p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp + nop.f 999 +(p0) dep Create_B = lookup, Create_B, 58, 5 +} +;; + +// +// Get [i_1] - lsb of N_fix_gr alone. +// Pos_r = abs (r) +// + + +{ .mmi + ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + + +{ .mmi + nop.m 999 +(p0) setf.sig B = Create_B +// +// Set table_ptr1 and table_ptr2 to base address of +// constant table. +// +(p0) add table_ptr1 = 480, table_ptr1 ;; +} + +{ .mmb + nop.m 999 +// +// Is i_1 or i_0 == 0 ? 
+// Create the constant 1 00000 1000000000000000000000... +// +(p0) ldfe P2_1 = [table_ptr1], 16 + nop.b 999 +} + +{ .mmi + nop.m 999 ;; +(p0) getf.exp exp_r = Pos_r + nop.i 999 +} +// +// Get r's exponent +// Get r's significand +// + +{ .mmi +(p0) ldfe P2_2 = [table_ptr1], 16 ;; +// +// Get the 5 bits or r for the lookup. 1.xxxxx .... +// from sig_r. +// Grab lsb of exp of B +// +(p0) ldfe P2_3 = [table_ptr1], 16 + nop.i 999 ;; +} + +{ .mii + nop.m 999 +(p0) andcm table_offset = 0x0001, exp_r ;; +(p0) shl table_offset = table_offset, 9 ;; +} + +{ .mii + nop.m 999 +// +// Deposit 0 00000 1000000000000000000000... on +// 1 xxxxx yyyyyyyyyyyyyyyyyyyyyy..., +// getting rid of the ys. +// Is B = 2** -2 or B= 2** -1? If 2**-1, then +// we want an offset of 512 for table addressing. +// +(p0) shladd table_offset = lookup, 4, table_offset ;; +// +// B = ........ 1xxxxx 1000000000000000000... +// +(p0) add table_ptr1 = table_ptr1, table_offset ;; +} + +{ .mmb + nop.m 999 +// +// B = ........ 1xxxxx 1000000000000000000... +// Convert B so it has the same exponent as Pos_r +// +(p0) ldfd T_hi = [table_ptr1], 8 + nop.b 999 ;; +} + +// +// x = |r| - B +// Load T_hi. +// Load C_hi. +// + +{ .mmf +(p0) addl table_ptr2 = @ltoff(TAN_BASE_CONSTANTS), gp +(p0) ldfs T_lo = [table_ptr1] +(p0) fmerge.se B = Pos_r, B +} +;; + +{ .mmi + ld8 table_ptr2 = [table_ptr2] + nop.m 999 + nop.i 999 +} +;; + +{ .mii +(p0) add table_ptr2 = 1360, table_ptr2 + nop.i 999 ;; +(p0) add table_ptr2 = table_ptr2, table_offset ;; +} + +{ .mfi +(p0) ldfd C_hi = [table_ptr2], 8 +(p0) fsub.s1 x = Pos_r, B + nop.i 999 ;; +} + +{ .mii +(p0) ldfs C_lo = [table_ptr2],255 + nop.i 999 ;; +// +// xsq = x * x +// N even: Tx = T_hi * x +// Load T_lo. +// Load C_lo - increment pointer to get SC_inv +// - cant get all the way, do an add later. 
+// +(p0) add table_ptr2 = 569, table_ptr2 ;; +} +// +// N even: Tx1 = Tx + 1 +// N odd: Cx1 = 1 - Cx +// + +{ .mfi +(p0) ldfe SC_inv = [table_ptr2], 0 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 xsq = x, x + nop.i 999 +} + +{ .mfi + nop.m 999 +(p11) fmpy.s1 Tx = T_hi, x + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fmpy.s1 Cx = C_hi, x + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: Cx = C_hi * x +// +(p0) fma.s1 P = P2_3, xsq, P2_2 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// N even and odd: P = P2_3 + P2_2 * xsq +// +(p11) fadd.s1 Tx1 = Tx, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: D = C_hi - tanx +// N odd: D = T_hi + tanx +// +(p11) fmpy.s1 CORR = SC_inv, T_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 Sx = SC_inv, x + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fmpy.s1 CORR = SC_inv, C_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fsub.s1 V_hi = f1, Cx + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 P = P, xsq, P2_1 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// N even and odd: P = P2_1 + P * xsq +// +(p11) fma.s1 V_hi = Tx, Tx1, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Result = sgn_r * tail + T_hi (user rounding mode for C1) +// N odd: Result = sgn_r * tail + C_hi (user rounding mode for C1) +// +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 CORR = CORR, c + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fnma.s1 V_hi = Cx,V_hi,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: V_hi = Tx * Tx1 + 1 +// N odd: Cx1 = 1 - Cx * Cx1 +// +(p0) fmpy.s1 P = P, xsq + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// N even and odd: P = P * xsq +// +(p11) fmpy.s1 V_hi = V_hi, T_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even and odd: tail = P * tail + V_lo +// +(p11) fmpy.s1 T_hi = sgn_r, T_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 CORR = CORR, sgn_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 
+(p12) fmpy.s1 V_hi = V_hi,C_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: V_hi = T_hi * V_hi +// N odd: V_hi = C_hi * V_hi +// +(p0) fma.s1 tanx = P, x, x + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fnmpy.s1 C_hi = sgn_r, C_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: V_lo = 1 - V_hi + C_hi +// N odd: V_lo = 1 - V_hi + T_hi +// +(p11) fadd.s1 CORR = CORR, T_lo + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fsub.s1 CORR = CORR, C_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even and odd: tanx = x + x * P +// N even and odd: Sx = SC_inv * x +// +(p11) fsub.s1 D = C_hi, tanx + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fadd.s1 D = T_hi, tanx + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N odd: CORR = SC_inv * C_hi +// N even: CORR = SC_inv * T_hi +// +(p0) fnma.s1 D = V_hi, D, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even and odd: D = 1 - V_hi * D +// N even and odd: CORR = CORR * c +// +(p0) fma.s1 V_hi = V_hi, D, V_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even and odd: V_hi = V_hi + V_hi * D +// N even and odd: CORR = sgn_r * CORR +// +(p11) fnma.s1 V_lo = V_hi, C_hi, f1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fnma.s1 V_lo = V_hi, T_hi, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: CORR = COOR + T_lo +// N odd: CORR = CORR - C_lo +// +(p11) fma.s1 V_lo = tanx, V_hi, V_lo + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fnma.s1 V_lo = tanx, V_hi, V_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: V_lo = V_lo + V_hi * tanx +// N odd: V_lo = V_lo - V_hi * tanx +// +(p11) fnma.s1 V_lo = C_lo, V_hi, V_lo + nop.i 999 +} + +{ .mfi + nop.m 999 +(p12) fnma.s1 V_lo = T_lo, V_hi, V_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: V_lo = V_lo - V_hi * C_lo +// N odd: V_lo = V_lo - V_hi * T_lo +// +(p0) fmpy.s1 V_lo = V_hi, V_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even and odd: V_lo = V_lo * V_hi +// +(p0) fadd.s1 tail = V_hi, V_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// 
+// N even and odd: tail = V_hi + V_lo +// +(p0) fma.s1 tail = tail, P, V_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: T_hi = sgn_r * T_hi +// N odd : C_hi = -sgn_r * C_hi +// +(p0) fma.s1 tail = tail, Sx, CORR + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even and odd: tail = Sx * tail + CORR +// +(p0) fma.s1 tail = V_hi, Sx, tail + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even and odd: tail = Sx * V_hi + tail +// +(p11) fma.s0 Result = sgn_r, tail, T_hi + nop.i 999 +} + +{ .mfb + nop.m 999 +(p12) fma.s0 Result = sgn_r, tail, C_hi +(p0) br.ret.sptk b0 ;; +} + +.endp __libm_tan +ASM_SIZE_DIRECTIVE(__libm_tan) + + + +// ******************************************************************* +// ******************************************************************* +// ******************************************************************* +// +// Special Code to handle very large argument case. +// Call int pi_by_2_reduce(&x,&r) +// for |arguments| >= 2**63 +// (Arg or x) is in f8 +// Address to save r and c as double +// NOTE(review): x, r and c are stored and reloaded below with stfe/ldfe in +// 16-byte slots, which does not match "as double" above - confirm against +// __libm_pi_by_2_reduce. +// After the call, N is taken from r8 and r and c are reloaded; control goes to +// TAN_SMALL_R when NEGTWO_TO_NEG2 < r < TWO_TO_NEG2, else to TAN_NORMAL_R. + +// (1) (2) (3) (call) (4) +// sp -> + psp -> + psp -> + sp -> + +// | | | | +// | r50 ->| <- r50 f0 ->| r50 -> | -> c +// | | | | +// sp-32 -> | <- r50 f0 ->| f0 ->| <- r50 r49 -> | -> r +// | | | | +// | r49 ->| <- r49 Arg ->| <- r49 | -> x +// | | | | +// sp -64 ->| sp -64 ->| sp -64 ->| | +// +// save pfs save b0 restore gp +// save gp restore b0 +// restore pfs + + + +.proc __libm_callout +__libm_callout: +TAN_ARG_TOO_LARGE: +.prologue +// (1) +{ .mfi + add GR_Parameter_r =-32,sp // Parameter: r address + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + +// (2) +{ .mmi + stfe [GR_Parameter_r ] = f0,16 // Clear Parameter r on stack + add GR_Parameter_X = 16,sp // Parameter x address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +// (3) +.body +{ .mib + stfe [GR_Parameter_r ] = f0,-16 //
Clear Parameter c on stack + nop.i 0 + nop.b 0 +} +{ .mib + stfe [GR_Parameter_X] = Arg // Store Parameter x on stack + nop.i 0 +(p0) br.call.sptk b0=__libm_pi_by_2_reduce# +} +;; + + +// (4) +{ .mmi + mov gp = GR_SAVE_GP // Restore gp +(p0) mov N_fix_gr = r8 + nop.i 999 +} +;; + +{ .mmi +(p0) ldfe Arg =[GR_Parameter_X],16 +(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4 + nop.i 999 +} +;; + + +{ .mmb +(p0) ldfe r =[GR_Parameter_r ],16 +(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],4 + nop.b 999 ;; +} + +{ .mfi +(p0) ldfe c =[GR_Parameter_r ] + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Is |r| < 2**(-2) +// +(p0) fcmp.lt.unc.s1 p6, p0 = r, TWO_TO_NEG2 + mov b0 = GR_SAVE_B0 // Restore return address +} +;; + +{ .mfi + nop.m 999 +(p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2 + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs +} +;; + +{ .mbb +.restore sp + add sp = 64,sp // Restore stack pointer +(p6) br.cond.spnt TAN_SMALL_R +(p0) br.cond.sptk TAN_NORMAL_R +} +;; +.endp __libm_callout +ASM_SIZE_DIRECTIVE(__libm_callout) + + +.proc __libm_TAN_SPECIAL +__libm_TAN_SPECIAL: + +// +// Code for NaNs, Unsupporteds, Infs, or +/- zero ? +// Invalid raised for Infs and SNaNs. +// The value left in Arg is Arg * f0, computed with status field s0. +// + +{ .mfb + nop.m 999 +(p0) fmpy.s0 Arg = Arg, f0 +(p0) br.ret.sptk b0 +} +.endp __libm_TAN_SPECIAL +ASM_SIZE_DIRECTIVE(__libm_TAN_SPECIAL) + + +.type __libm_pi_by_2_reduce#,@function +.global __libm_pi_by_2_reduce# diff --git a/sysdeps/ia64/fpu/s_atan.S b/sysdeps/ia64/fpu/s_atan.S new file mode 100644 index 0000000000..e3a5c85f2a --- /dev/null +++ b/sysdeps/ia64/fpu/s_atan.S @@ -0,0 +1,953 @@ +.file "atan.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 4/13/00: Improved speed +// 4/19/00: Removed the qualifying predicate from the fmerge.s that +// takes the absolute value. +// 6/16/00: Reassigned FP registers to eliminate stalls on loads +// 8/30/00: Saved 5 cycles in main path by rearranging large argument logic +// and delaying use of result of fcmp in load by 1 group +// +// API +//============================================================== +// double atan( double x); +// +// Overview of operation +//============================================================== +// atan(x) = sign(X)pi/2 - atan(1/x) +// +// We have two paths: |x| > 1 and |x| <= 1 +// +// |x| > 1 +// ========================================== +// +// c = frcpa(x) which is approximately 1/x +// +// xc = 1- B +// B = 1-xc +// +// Approximate 1/(1-B)^k by a polynomial in B, poly(B) +// k is 45. 
+// +// poly(B) = 1 + r1 B + r2 B^2 + ...+ r10 B^10 +// +// c^k = (1-B)^k/x^k +// c^k/(1-B)^k = 1/x^k +// c^k poly(B) = 1/x^k + +// poly(x) = series(atan(1/x)) = 1/x - 1/3x^3 + 1/5x^5 - 1/7x^7 .... + 1/45 x^45 +// = 1/x^45 ( x^44 - x^42/3 + x^40/5 - x^38/7 ... +1) +// = 1/x^45 ( y^22 - y^21/3 + y^20/5 - y^19/7 ... +1) +// +// = c^45 poly(B) poly(x) +// = c^45 r(B) q(y) + +// q(y) = q0 + q1 y + q2 y^2 + ... + q22 y^22 +// where q22 is 1.0 + +// atan(x) = sign(X)pi/2 - c^45 r(B) q(y) + +// |x| <= 1 +// ========================================== +// poly(x) = series(atan(x)) = x - x^3/3 + x^5/5 + ..... +// poly(x) = series(atan(x)) = x + x^3(- 1/3 + x^2/5 + ..... + x^44/47) +// poly(x) = series(atan(x)) = x + x^3(p0 + x^2/5 + ..... + x^44/47) +// poly(x) = series(atan(x)) = x + x^3(p0 + y/5 + ..... + y^22/47) + +// where p0 is about -1/3. + +// atan(x) = poly(x) + +#include "libm_support.h" + +// Special Values +//============================================================== +// atan(QNAN) = QNAN +// atan(SNAN) = quieted SNAN +// atan(+-inf) = +- pi/2 +// atan(+-0) = +-0 + + + +// Registers used +//============================================================== + +// predicate registers used: +// p6 -> p11 + +// floating-point registers used: +// f10 -> f14, f32 -> f127 + +// general registers used +// r14 -> r20 + +// Assembly macros +//============================================================== +atan_Pi_by_2 = f32 +atan_S_PI = f33 +atan_ABS_f8 = f34 + +atan_R0 = f35 +atan_R1 = f36 +atan_R2 = f37 +atan_R3 = f38 +atan_R4 = f39 +atan_R5 = f40 +atan_R6 = f41 +atan_R7 = f42 +atan_R8 = f43 +atan_R9 = f44 +atan_R10 = f45 + +atan_Q0 = f46 + +atan_Q1 = f47 +atan_Q2 = f48 +atan_Q3 = f49 +atan_Q4 = f50 +atan_Q5 = f51 +atan_Q6 = f52 +atan_Q7 = f53 +atan_Q8 = f54 +atan_Q9 = f55 +atan_Q10 = f56 + +atan_Q11 = f57 +atan_Q12 = f58 +atan_Q13 = f59 +atan_Q14 = f60 +atan_Q15 = f61 +atan_Q16 = f62 +atan_Q17 = f63 +atan_Q18 = f64 +atan_Q19 = f65 +atan_Q20 = f66 +atan_Q21 = f67 +atan_Q22 = f68 +
+// P and Q constants are mutually exclusive +// so they can share macro definitions +// NOTE(review): P4-P11, P20 and P21 are given registers of their own (f10-f14, +// f99, f103, f114, f126, f127) rather than sharing with a Q constant. +atan_P0 = f46 + +atan_P1 = f47 +atan_P2 = f48 +atan_P3 = f49 +atan_P4 = f10 +atan_P5 = f11 +atan_P6 = f12 +atan_P7 = f13 +atan_P10 = f103 + +atan_P11 = f114 +atan_P12 = f58 +atan_P13 = f59 +atan_P14 = f60 +atan_P15 = f61 +atan_P16 = f62 +atan_P17 = f63 +atan_P18 = f64 +atan_P19 = f65 +atan_P20 = f14 +atan_P21 = f99 +atan_P22 = f68 +// end of P constant macros + +atan_C = f69 +atan_Y = f70 +atan_B = f71 +atan_Z = f72 +atan_V11 = f73 +atan_V12 = f74 + +atan_V7 = f75 +atan_V8 = f76 + +atan_W13 = f77 +atan_W11 = f78 + +atan_V3 = f79 +atan_V4 = f80 + +atan_G11 = f81 +atan_G12 = f82 +atan_G7 = f83 +atan_G8 = f84 + +atan_Z1 = f85 +atan_W7 = f86 + +atan_G3 = f87 +atan_W8 = f88 +atan_V9 = f89 +atan_V10 = f90 + +atan_G10 = f91 +atan_W3 = f92 +atan_G4 = f93 +atan_G9 = f94 + +atan_G6 = f95 +atan_W4 = f96 +atan_Z2 = f97 +atan_V6 = f98 + +atan_V2 = f99 +atan_W6 = f100 +atan_W10 = f101 +atan_Y3 = f102 + +atan_G2 = f103 + +atan_Y8 = f104 + +atan_G5 = f105 +atan_Z3 = f106 +atan_Z4 = f107 +atan_W2 = f108 +atan_V5 = f109 + +atan_W5 = f110 +atan_G1 = f111 +atan_Y11 = f112 + +atan_Z5 = f113 +atan_Z6 = f114 +atan_V1 = f115 +atan_W1 = f116 + +atan_Z7 = f117 +atan_Q = f118 +atan_Z = f119 // NOTE(review): atan_Z is also assigned f72 above; presumably this later value is the one in effect - confirm +atan_abs_f8 = f120 // NOTE(review): differs only in case from atan_ABS_f8 (f34) + +atan_V13 = f121 +atan_Xcub = f122 +atan_Y12 = f123 +atan_P = f124 + +atan_NORM_f8 = f125 + +atan_P8 = f126 +atan_P9 = f127 + + + + +atan_GR_AD_R = r14 +atan_GR_AD_Q = r15 +atan_GR_AD_P = r16 +atan_GR_10172 = r17 +atan_GR_exp_f8 = r18 +atan_GR_signexp_f8 = r19 +atan_GR_exp_mask = r20 + + + + +///////////////////////////////////////////////////////////// + + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +double_atan_constants_R: +ASM_TYPE_DIRECTIVE(double_atan_constants_R,@object) + data8 0xB36B46B9C5443CED, 0x0000401C //R8 + data8 0x842633E0D126261F, 0x0000401F //R9 + data8 0xBE04FFFFFFFF46E0, 0x00004010 //R4 + data8 0xE8C62000244D66E2, 0x00004013 //R5 + data8 0xF2790C001E3789B3,
0x00004016 //R6 + data8 0xDCD2CCF97D7C764F, 0x00004019 //R7 + data8 0xB40000000000000B, 0x00004004 //R1 + data8 0xB265F3D38F5EE28F, 0x00004021 //R10 + data8 0x8160000000000001, 0x00004009 //R2 + data8 0xFD5BFFFFFFFE55CD, 0x0000400C //R3 + data8 0xC90FDAA22168C235, 0x00003FFF // pi/2 +ASM_SIZE_DIRECTIVE(double_atan_constants_R) + +// Q and P tables: each entry is two data8 words (16 bytes) and is read by atan +// with ldfe ...,16; entries are stored in the order atan loads them, not in +// index order. +double_atan_constants_Q: +ASM_TYPE_DIRECTIVE(double_atan_constants_Q,@object) + data8 0xEBD602FA7761BC33, 0x00003FF9 //Q8 + data8 0x8CB1CABD6A91913C, 0x0000BFFA //Q9 + data8 0x84C665C37D623CD2, 0x00003FF7 //Q4 + data8 0x8DE0D1673DAEA9BC, 0x0000BFF8 //Q5 + data8 0xF658ADBE2C6E6FCC, 0x00003FF8 //Q6 + + data8 0xB56307BE1DD3FFB6, 0x0000BFF9 //Q7 + data8 0xAAAAAAAAAAAA8000, 0x0000BFFD //Q21 + data8 0x8000000000000000, 0x00003FFF //Q22 + data8 0x924924923A9D710C, 0x0000BFFC //Q19 + data8 0xCCCCCCCCCC9380E7, 0x00003FFC //Q20 + + data8 0xA644DC250EFA2800, 0x00003FED //Q0 + data8 0x83DEAE24EEBF5E44, 0x0000BFF1 //Q1 + data8 0xC758CCC64793D4EC, 0x00003FF3 //Q2 + data8 0xBFDC0B54E7C89DCE, 0x0000BFF5 //Q3 + data8 0x888855199D1290AF, 0x0000BFFB //Q15 + + data8 0x9D89D3BE514B0178, 0x00003FFB //Q16 + data8 0xBA2E8B4DEC70282A, 0x0000BFFB //Q17 + data8 0xE38E38DF9E9FC83B, 0x00003FFB //Q18 + data8 0x9F8781CC990029D9, 0x00003FFA //Q10 + data8 0xB0B39472DEBA3C79, 0x0000BFFA //Q11 + + data8 0xC2AFAEF8C85B0BC6, 0x00003FFA //Q12 + data8 0xD780E539797525DD, 0x0000BFFA //Q13 + data8 0xF0EDC449AC786DF9, 0x00003FFA //Q14 +ASM_SIZE_DIRECTIVE(double_atan_constants_Q) + + + +double_atan_constants_P: +ASM_TYPE_DIRECTIVE(double_atan_constants_P,@object) + data8 0xB1899EC590CDB8DF, 0x0000BFFA //P10 + data8 0xA1E79850A67D59B0, 0x00003FFA //P11 + data8 0x911D8B30C2A96E6D, 0x0000BFF3 //P20 + data8 0xB87233C68A640706, 0x00003FF0 //P21 + data8 0xD78E4B82F3C29D7A, 0x0000BFFA //P8 + + data8 0xC2EBE37AF932C14F, 0x00003FFA //P9 + data8 0xBA2E8B94AA104DD6, 0x0000BFFB //P4 + data8 0x9D89D7A640B71D38, 0x00003FFB //P5 + data8 0x88887CA2CE9B2A40, 0x0000BFFB //P6 + data8
0xF0F017D57A919C1E, 0x00003FFA //P7 + + data8 0xD0D635F230C80E06, 0x0000BFF8 //P16 + data8 0xE847BECA7209B479, 0x00003FF7 //P17 + data8 0xD14C6A2AAE0D5B07, 0x0000BFF6 //P18 + data8 0x915F612A5C469117, 0x00003FF5 //P19 + data8 0x921EDE5FD0DBBBE2, 0x0000BFFA //P12 + + data8 0xFFD303C2C8535445, 0x00003FF9 //P13 + data8 0xD30DF50E295386F7, 0x0000BFF9 //P14 + data8 0x9E81F2B1BBD210A8, 0x00003FF9 //P15 + data8 0xAAAAAAAAAAAAA800, 0x0000BFFD //P0 + data8 0xCCCCCCCCCCC7D476, 0x00003FFC //P1 + + data8 0x9249249247838066, 0x0000BFFC //P2 + data8 0xE38E38E302290D68, 0x00003FFB //P3 + data8 0xDF7F0A816F7E5025, 0x0000BFEC //P22 +ASM_SIZE_DIRECTIVE(double_atan_constants_P) + + +.align 32 +.global atan# + +//////////////////////////////////////////////////////// + + + +.section .text +.proc atan# +.align 32 + +// double atan(double x): x is read from f8 and the result is written to f8. +atan: + +{ .mmf +(p0) addl atan_GR_AD_P = @ltoff(double_atan_constants_P), gp +(p0) addl atan_GR_AD_Q = @ltoff(double_atan_constants_Q), gp +(p0) fmerge.s atan_ABS_f8 = f0,f8 +} +;; + +{ .mmf + ld8 atan_GR_AD_P = [atan_GR_AD_P] + ld8 atan_GR_AD_Q = [atan_GR_AD_Q] +(p0) frcpa.s1 atan_C,p8 = f1,f8 +} +;; + +{ .mmf +(p0) addl atan_GR_AD_R = @ltoff(double_atan_constants_R), gp +(p0) addl atan_GR_exp_mask = 0x1ffff, r0 +(p0) fma.s1 atan_Y = f8,f8,f0 +} +;; + +// This fnorm takes faults or sets fault flags +{ .mmf +(p0) mov atan_GR_10172 = 0x10172 + ld8 atan_GR_AD_R = [atan_GR_AD_R] +(p0) fnorm atan_NORM_f8 = f8 +} +;; + + +// qnan snan inf norm unorm 0 -+ +// 1 1 0 0 0 1 11 +// c 7 + +// p9 set if we have a NAN or +-0 + +{ .mmf +(p0) ldfe atan_Q8 = [atan_GR_AD_Q],16 +(p0) ldfe atan_P10 = [atan_GR_AD_P],16 +(p0) fclass.m.unc p9, p0 = f8, 0xc7 +} +;; + + +{ .mmi +(p0) ldfe atan_Q9 = [atan_GR_AD_Q],16 +(p0) ldfe atan_P11 = [atan_GR_AD_P],16 + nop.i 999 +} +;; + + +{ .mmf +(p0) ldfe atan_Q4 = [atan_GR_AD_Q],16 +(p0) ldfe atan_P20 = [atan_GR_AD_P],16 +(p9) fma.d.s0 f8 = f8,f1,f0 +;; +} + +// Exit if we have a NAN or +-0 +{ .mmb +(p0) ldfe atan_Q5 = [atan_GR_AD_Q],16 +(p0) ldfe
atan_P21 = [atan_GR_AD_P],16 +(p9) br.ret.spnt b0 +;; +} + + +// p6 is TRUE if |x| <= 1 +// p7 is TRUE if |x| > 1 +// p8 and p9 are set below as copies of p6 and p7 (cmp.eq.unc r0,r0 under each). +// Later, when the biased exponent of x is >= 0x10172, p11 is set and p6/p7 are +// cleared, so the result is sign(x)*pi/2; p8/p9 keep the earlier path choice. +// Otherwise: p6 result = x + P*x^3, p7 result = sign(x)*pi/2 - Z*Q. +{ .mmf +(p0) ldfe atan_Q6 = [atan_GR_AD_Q],16 +(p0) ldfe atan_P8 = [atan_GR_AD_P],16 +(p0) fcmp.le.unc p6,p7 = atan_ABS_f8, f1 +;; +} + + +{ .mfi +(p0) ldfe atan_Q7 = [atan_GR_AD_Q],16 +(p0) fma.s1 atan_Z = atan_C, atan_C, f0 + nop.i 999 +} +{ .mfi +(p0) ldfe atan_P9 = [atan_GR_AD_P],16 +(p0) fnma.s1 atan_B = atan_C,f8, f1 + nop.i 999 ;; +} + + +{ .mfi +(p0) ldfe atan_Q21 = [atan_GR_AD_Q],16 +(p0) fma.s1 atan_V12 = atan_Y, atan_Y, f0 + nop.i 999 +} +{ .mfi +(p0) ldfe atan_P4 = [atan_GR_AD_P],16 +(p0) fma.s1 atan_Xcub = f8, atan_Y , f0 + nop.i 999 +;; +} + + +{ .mmi +(p7) ldfe atan_Q22 = [atan_GR_AD_Q],16 +(p6) ldfe atan_P5 = [atan_GR_AD_P],16 +(p6) cmp.eq.unc p8,p0 = r0,r0 +;; +} + + +{ .mmi +(p7) ldfe atan_Q19 = [atan_GR_AD_Q],16 +(p6) ldfe atan_P6 = [atan_GR_AD_P],16 +(p7) cmp.eq.unc p9,p0 = r0,r0 +;; +} + + +{ .mmi +(p7) ldfe atan_Q20 = [atan_GR_AD_Q],16 +(p6) ldfe atan_P7 = [atan_GR_AD_P],16 + nop.i 999 +;; +} + +{ .mfi +(p7) ldfe atan_Q0 = [atan_GR_AD_Q],16 +(p6) fma.s1 atan_V13 = atan_Y, atan_P11, atan_P10 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P16 = [atan_GR_AD_P],16 +(p7) fma.s1 atan_V11 = atan_Y, atan_Q9, atan_Q8 + nop.i 999 ;; +} + + +{ .mfi +(p7) ldfe atan_Q1 = [atan_GR_AD_Q],16 +(p7) fma.s1 atan_G12 = atan_B, atan_B, f0 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P17 = [atan_GR_AD_P],16 +(p0) fma.s1 atan_V9 = atan_V12, atan_V12, f0 + nop.i 999 ;; +} + + +{ .mfi +(p7) ldfe atan_Q2 = [atan_GR_AD_Q],16 +(p6) fma.s1 atan_W11 = atan_Y, atan_P21, atan_P20 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P18 = [atan_GR_AD_P],16 +(p7) fma.s1 atan_V7 = atan_Y, atan_Q5, atan_Q4 + nop.i 999 ;; +} + +{ .mfi +(p7) ldfe atan_Q3 = [atan_GR_AD_Q],16 +(p7) fma.s1 atan_Z1 = atan_Z, atan_Z, f0 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P19 = [atan_GR_AD_P],16 +(p7) fma.s1 atan_Y3 = atan_Y , atan_V12, f0 + nop.i 999 ;; +} + +{ .mfi +(p7) ldfe atan_R8 = [atan_GR_AD_R],16 +(p6)
fma.s1 atan_V11 = atan_Y, atan_P9, atan_P8 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P12 = [atan_GR_AD_P],16 +(p7) fma.s1 atan_V8 = atan_Y, atan_Q7, atan_Q6 + nop.i 999 ;; +} + +{ .mmi +(p7) ldfe atan_R9 = [atan_GR_AD_R],16 +(p6) ldfe atan_P13 = [atan_GR_AD_P],16 + nop.i 999 +;; +} + +{ .mfi +(p7) ldfe atan_R4 = [atan_GR_AD_R],16 +(p6) fma.s1 atan_V7 = atan_Y, atan_P5, atan_P4 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P14 = [atan_GR_AD_P],16 +(p7) fma.s1 atan_W13 = atan_Y, atan_Q22, atan_Q21 + nop.i 999 ;; +} + + +{ .mfi +(p7) ldfe atan_R5 = [atan_GR_AD_R],16 +(p6) fma.s1 atan_Y12 = atan_V9 , atan_V9 , f0 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P15 = [atan_GR_AD_P],16 +(p7) fma.s1 atan_Y8 = atan_V9 , atan_V9 , f0 + nop.i 999 ;; +} + + +{ .mfi +(p7) ldfe atan_R6 = [atan_GR_AD_R],16 +(p6) fma.s1 atan_V8 = atan_Y, atan_P7, atan_P6 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P0 = [atan_GR_AD_P],16 +(p7) fma.s1 atan_W11 = atan_Y, atan_Q20, atan_Q19 + nop.i 999 ;; +} + + +{ .mfi +(p7) ldfe atan_R7 = [atan_GR_AD_R],16 +(p7) fma.s1 atan_Z2 = atan_Z1 , atan_Z1, f0 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P1 = [atan_GR_AD_P],16 +(p6) fma.s1 atan_V10 = atan_V12, atan_V13, atan_V11 + nop.i 999 ;; +} + +{ .mfi +(p7) ldfe atan_Q15 = [atan_GR_AD_Q],16 +(p6) fma.s1 atan_W7 = atan_Y, atan_P17, atan_P16 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P2 = [atan_GR_AD_P],16 +(p7) fma.s1 atan_V3 = atan_Y, atan_Q1 , atan_Q0 + nop.i 999 ;; +} + +{ .mfi +(p7) ldfe atan_Q16 = [atan_GR_AD_Q],16 +(p7) fma.s1 atan_G9 = atan_G12, atan_G12, f0 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P3 = [atan_GR_AD_P],16 +(p7) fma.s1 atan_V6 = atan_V12, atan_V8, atan_V7 + nop.i 999 ;; +} + + +{ .mfi +(p7) ldfe atan_R1 = [atan_GR_AD_R],16 +(p6) fma.s1 atan_W8 = atan_Y, atan_P19, atan_P18 + nop.i 999 +} +{ .mfi +(p6) ldfe atan_P22 = [atan_GR_AD_P],16 +(p7) fma.s1 atan_V4 = atan_Y, atan_Q3 , atan_Q2 + nop.i 999 ;; +} + + +{ .mfi + getf.exp atan_GR_signexp_f8 = atan_NORM_f8 +(p7) fma.s1 atan_Y11 = atan_Y3 , atan_Y8 , f0 + nop.i 999 +}
+{ .mfi +(p7) ldfe atan_Q17 = [atan_GR_AD_Q],16 +(p6) fma.s1 atan_V6 = atan_V12, atan_V8, atan_V7 + nop.i 999 ;; +} + + +{ .mfi +(p7) ldfe atan_Q18 = [atan_GR_AD_Q],16 +(p6) fma.s1 atan_W3 = atan_Y, atan_P13, atan_P12 + nop.i 999 +} +{ .mfi +(p7) ldfe atan_R10 = [atan_GR_AD_R],16 +(p7) fma.s1 atan_G11 = atan_B, atan_R9 , atan_R8 + nop.i 999 ;; +} + + +{ .mfi +(p7) ldfe atan_Q10 = [atan_GR_AD_Q],16 +(p7) fma.s1 atan_Z3 = atan_Z1 , atan_Z2 , f0 +(p0) and atan_GR_exp_f8 = atan_GR_signexp_f8,atan_GR_exp_mask +} +{ .mfi +(p7) ldfe atan_R2 = [atan_GR_AD_R],16 +(p7) fma.s1 atan_Z4 = atan_Z2 , atan_Z2 , f0 + nop.i 999 ;; +} + + +{ .mfi +(p7) ldfe atan_Q11 = [atan_GR_AD_Q],16 +(p6) fma.s1 atan_W4 = atan_Y, atan_P15, atan_P14 + nop.i 999 +} +{ .mfi +(p7) ldfe atan_R3 = [atan_GR_AD_R],16 +(p7) fma.s1 atan_G7 = atan_B, atan_R5 , atan_R4 +(p0) cmp.le.unc p11,p0 = atan_GR_10172,atan_GR_exp_f8 +;; +} + + +{ .mmf +(p9) ldfe atan_Q12 = [atan_GR_AD_Q],16 +(p0) ldfe atan_S_PI = [atan_GR_AD_R],16 +(p8) fma.s1 atan_W6 = atan_V12, atan_W8, atan_W7 +;; +} + + + +{ .mfi +(p9) ldfe atan_Q13 = [atan_GR_AD_Q],16 +(p8) fma.s1 atan_V3 = atan_Y, atan_P1 , atan_P0 +(p11) cmp.ne.and p6,p7 = r0,r0 +} +{ .mfi + nop.m 999 +(p8) fma.s1 atan_V5 = atan_V9 , atan_V10, atan_V6 + nop.i 999 ;; +} + + +.pred.rel "mutex",p6,p7,p11 +{ .mfi +(p7) ldfe atan_Q14 = [atan_GR_AD_Q],16 +(p6) fma.s1 atan_Y12 = atan_V9 , atan_Y12, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_G8 = atan_B, atan_R7 , atan_R6 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fma.s1 atan_V4 = atan_Y, atan_P3 , atan_P2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_W7 = atan_Y, atan_Q16, atan_Q15 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fma.s1 atan_W10 = atan_V12, atan_P22, atan_W11 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_G3 = atan_B, atan_R1 , f1 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fma.s1 atan_W2 = atan_V12, atan_W4 , atan_W3 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_V2 = atan_V12,
atan_V4 , atan_V3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.s1 atan_W8 = atan_Y, atan_Q18, atan_Q17 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_G10 = atan_G12, atan_R10, atan_G11 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.s1 atan_V10 = atan_V12, atan_Q10, atan_V11 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_G6 = atan_G12, atan_G8 , atan_G7 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fma.s1 atan_V2 = atan_V12, atan_V4, atan_V3 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_G4 = atan_B , atan_R3 , atan_R2 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fma.s1 atan_W5 = atan_V9 , atan_W10, atan_W6 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_W3 = atan_Y , atan_Q12, atan_Q11 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p7) fma.s1 atan_Z5 = atan_Z3 , atan_Z4 , f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_W10 = atan_V12, atan_W13, atan_W11 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p7) fma.s1 atan_W4 = atan_Y , atan_Q14, atan_Q13 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_W6 = atan_V12, atan_W8, atan_W7 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.s1 atan_V5 = atan_V9 , atan_V10, atan_V6 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_G5 = atan_G9 , atan_G10, atan_G6 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fma.s1 atan_V1 = atan_V9 , atan_V5 , atan_V2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_G2 = atan_G12, atan_G4 , atan_G3 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fma.s1 atan_W1 = atan_V9 , atan_W5 , atan_W2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_Z6 = atan_Z4 , atan_C , f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.s atan_S_PI = f8, atan_S_PI + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p7) fma.s1 atan_W5 = atan_V9 , atan_W10, atan_W6 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_W2 = atan_V12, atan_W4 , atan_W3 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p7) fma.s1 atan_G1 = atan_G9 , atan_G5 , atan_G2 + nop.i 999 +} +{ .mfi + nop.m
999 +(p7) fma.s1 atan_V1 = atan_V9 , atan_V5 , atan_V2 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fma.s1 atan_P = atan_Y12, atan_W1 , atan_V1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_Z7 = atan_Z5 , atan_Z6 , f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p7) fma.s1 atan_W1 = atan_V9 , atan_W5 , atan_W2 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p11) fma.d.s0 f8 = atan_S_PI,f1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 atan_Z = atan_G1 , atan_Z7 , f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p7) fma.s1 atan_Q = atan_Y11, atan_W1 , atan_V1 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p6) fma.d.s0 f8 = atan_P , atan_Xcub , f8 + nop.i 999 +} +{ .mfb + nop.m 999 +(p7) fnma.d.s0 f8 = atan_Z , atan_Q , atan_S_PI +(p0) br.ret.sptk b0 ;; +} + +.endp atan +ASM_SIZE_DIRECTIVE(atan) diff --git a/sysdeps/ia64/fpu/s_atanf.S b/sysdeps/ia64/fpu/s_atanf.S new file mode 100644 index 0000000000..8edd5d45a9 --- /dev/null +++ b/sysdeps/ia64/fpu/s_atanf.S @@ -0,0 +1,543 @@ +.file "atanf.s" + +// THIS IS NOT OPTIMIZED AND NOT OFFICIAL + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. + + +// History +//============================================================== +// ?/??/00 Initial revision +// 8/17/00 Changed predicate register macro-usage to direct predicate +// names due to an assembler bug. + +#include "libm_support.h" + +// +// Assembly macros +//============================================================== + +// integer registers used +// (EXP_Addr1 walks atanf_coeff_1_table, EXP_Addr2 walks atanf_coeff_2_table) +EXP_Addr1 = r33 +EXP_Addr2 = r34 + +// floating point registers used +// (coefficient registers are filled two at a time by ldfpd, so adjacent names mirror adjacent table entries) +atanf_coeff_R4 = f32 +atanf_coeff_R5 = f33 +atanf_coeff_R1 = f34 +atanf_coeff_R2 = f35 + +atanf_coeff_R3 = f36 +atanf_coeff_P1 = f37 +atanf_coeff_Q6 = f38 +atanf_coeff_Q7 = f39 +atanf_coeff_Q8 = f40 + +atanf_coeff_Q9 = f41 +atanf_coeff_Q4 = f42 +atanf_coeff_Q5 = f43 +atanf_coeff_Q2 = f44 +atanf_coeff_Q3 = f45 + +atanf_coeff_P5 = f46 +atanf_coeff_P6 = f47 +atanf_coeff_Q0 = f48 +atanf_coeff_Q1 = f49 +atanf_coeff_P7 = f50 + +atanf_coeff_P8 = f51 +atanf_coeff_P3 = f52 +atanf_coeff_P4 = f53 +atanf_coeff_P9 = f54 +atanf_coeff_P10 = f55 + +atanf_coeff_P2 = f56 +atanf_piby2 = f57 +atanf_z = f58 +atanf_b = f59 +atanf_zsq = f60 + +atanf_sgn_x = f61 +atanf_sgnx_piby2 = f62 +atanf_abs_x = f63 +atanf_t = f64 +atanf_xcub = f65 + +atanf_tsq = f66 +atanf_t4 = f67 +atanf_x5 = f68 +atanf_x6 = f69 +atanf_x11 = f70 + +atanf_poly_p1 = f71
+atanf_poly_p2 = f72 +atanf_poly_p3 = f73 +atanf_poly_p4 = f74 +atanf_poly_p5 = f75 + +atanf_poly_q1 = f76 +atanf_poly_q2 = f77 +atanf_poly_q3 = f78 +atanf_poly_q4 = f79 +atanf_poly_q5 = f80 + +// NOTE(review): atanf_poly_r1 and atanf_poly_r2 are assigned again a few lines below (f90 and f89). The values given here (f81, which is also atanf_poly_q, and f82) look stale - confirm the assembler takes the later assignments. +atanf_poly_q = f81 +atanf_poly_r1 = f81 +atanf_poly_r2 = f82 +atanf_poly_r3 = f83 +atanf_bsq = f84 +atanf_z4 = f85 + +atanf_z5 = f86 +atanf_z8 = f87 +atanf_z13 = f88 +atanf_poly_r2 = f89 +atanf_poly_r1 = f90 + +atanf_z8_bsq = f91 +atanf_poly_r = f92 +atanf_z21_poly_r = f93 +atanf_answer = f8 + + +// predicate registers used +//atanf_pred_LE1 = p6 +//atanf_pred_GT1 = p7 + + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +// Coefficient table read through EXP_Addr1: each ldfpd in atanf consumes two adjacent data8 entries, in the order listed. +atanf_coeff_1_table: +ASM_TYPE_DIRECTIVE(atanf_coeff_1_table,@object) +data8 0x40c4c241be751ff2 // r4 +data8 0x40e9f300c2f3070b // r5 +data8 0x409babffef772075 // r3 +data8 0xbfd5555512191621 // p1 +data8 0x3fc9997e7afbff4e // p2 = q8 +data8 0xbfd5555512191621 // p1 = q9 +data8 0x3f97105b4160f86b // p8 = q2 +data8 0xbfa6e10ba401393f // p7 = q3 +data8 0x3f522e5d33bc9baa // p10 = q0 +data8 0xbf7deaadaa336451 // p9 = q1 +data8 0xbfc2473c5145ee38 // p3 +data8 0x3fbc4f512b1865f5 // p4 +data8 0x3fc9997e7afbff4e // p2 +data8 0x3ff921fb54442d18 // pi/2 +ASM_SIZE_DIRECTIVE(atanf_coeff_1_table) + + + +// Coefficient table read through EXP_Addr2, again two adjacent data8 entries per ldfpd. +atanf_coeff_2_table: +ASM_TYPE_DIRECTIVE(atanf_coeff_2_table,@object) +data8 0x4035000000004284 // r1 +data8 0x406cdffff336a59b // r2 +data8 0x3fbc4f512b1865f5 // p4 = q6 +data8 0xbfc2473c5145ee38 // p3 = q7 +data8 0x3fb142a73d7c54e3 // p6 = q4 +data8 0xbfb68eed6a8cfa32 // p5 = q5 +data8 0xbfb68eed6a8cfa32 // p5 +data8 0x3fb142a73d7c54e3 // p6 +data8 0xbfa6e10ba401393f // p7 +data8 0x3f97105b4160f86b // p8 +data8 0xbf7deaadaa336451 // p9 +data8 0x3f522e5d33bc9baa // p10 +ASM_SIZE_DIRECTIVE(atanf_coeff_2_table) + + + +// atanf: the argument arrives in f8 and the result is written back to f8 (atanf_answer = f8). +// Set-up: z = frcpa(1,x), t = x*x, b = 1 - x*z; sgn_x is 1.0 with the sign of x, abs_x is x with the sign of 1.0. +// p6 (abs_x <= 1) and p7 (otherwise) from the fcmp.le select which of the two final fma.s/fnma.s writes the result. +// p8 from the 0xe7 fclass (zero, infinity or NaN) branches to ATANF_X_INF_NAN_ZERO instead. +.global atanf + +.text +.proc atanf + +.align 32 +atanf: + + +{ .mfi + alloc r32 = ar.pfs,1,2,0,0 + frcpa.s1 atanf_z,p0 = f1,f8 + addl EXP_Addr2 = @ltoff(atanf_coeff_2_table),gp +} +{ .mfi + addl EXP_Addr1 = @ltoff(atanf_coeff_1_table),gp +
fma.s1 atanf_t = f8,f8,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fmerge.s atanf_sgn_x = f8,f1 + nop.i 999;; +} + +{ .mfi + ld8 EXP_Addr1 = [EXP_Addr1] + fmerge.s atanf_abs_x = f1,f8 + nop.i 999 +} +{ .mfi + ld8 EXP_Addr2 = [EXP_Addr2] + nop.f 999 + nop.i 999;; +} + + +// NOTE(review): p8 written by the @zero fclass below is rewritten by the 0xe7 fclass further down before any (p8) use. +{ .mfi + nop.m 999 + fclass.m p8,p0 = f8,0x7 // @zero + nop.i 999;; +} + +// NOTE(review): p9/p10 are not read on the main path; presumably this s0 compare is issued only for its status-flag effect - confirm. +{ .mfi + nop.m 999 + fcmp.eq.unc.s0 p9,p10 = f8,f1 + nop.i 999;; +} + +{ .mfi + ldfpd atanf_coeff_R4,atanf_coeff_R5 = [EXP_Addr1],16 + fnma.s1 atanf_b = f8,atanf_z,f1 + nop.i 999 +} +{ .mfi + ldfpd atanf_coeff_R1,atanf_coeff_R2 = [EXP_Addr2],16 + fma.s1 atanf_zsq = atanf_z,atanf_z,f0 + nop.i 999;; +} + + +{ .mfi + ldfpd atanf_coeff_R3,atanf_coeff_P1 = [EXP_Addr1],16 + fma.s1 atanf_xcub = f8,atanf_t,f0 + nop.i 999 +} +{ .mfi + ldfpd atanf_coeff_Q6,atanf_coeff_Q7 = [EXP_Addr2],16 + fma.s1 atanf_tsq = atanf_t,atanf_t,f0 + nop.i 999;; +} + + +{ .mfi + ldfpd atanf_coeff_Q8,atanf_coeff_Q9 = [EXP_Addr1],16 +// fcmp.le.s1 atanf_pred_LE1,atanf_pred_GT1 = atanf_abs_x,f1 + fcmp.le.s1 p6,p7 = atanf_abs_x,f1 + nop.i 999 +} +{ .mfi + ldfpd atanf_coeff_Q4,atanf_coeff_Q5 = [EXP_Addr2],16 + nop.f 999 + nop.i 999;; +} + + +{ .mfi + ldfpd atanf_coeff_Q2,atanf_coeff_Q3 = [EXP_Addr1],16 + fclass.m p8,p0 = f8,0xe7 // @inf|@qnan|@snan|@zero + nop.i 999 +} +{ .mfi + ldfpd atanf_coeff_P5,atanf_coeff_P6 = [EXP_Addr2],16 + nop.f 999 + nop.i 999;; +} + + +{ .mfi + ldfpd atanf_coeff_Q0,atanf_coeff_Q1 = [EXP_Addr1],16 + nop.f 999 + nop.i 999 +} +{ .mfi + ldfpd atanf_coeff_P7,atanf_coeff_P8 = [EXP_Addr2],16 + nop.f 999 + nop.i 999;; +} + + +{ .mfi + ldfpd atanf_coeff_P3,atanf_coeff_P4 = [EXP_Addr1],16 + fma.s1 atanf_bsq = atanf_b,atanf_b,f0 + nop.i 999 +} +{ .mfi + ldfpd atanf_coeff_P9,atanf_coeff_P10 = [EXP_Addr2] + fma.s1 atanf_z4 = atanf_zsq,atanf_zsq,f0 + nop.i 999;; +} + + +{ .mfi + ldfpd atanf_coeff_P2,atanf_piby2 = [EXP_Addr1] + fma.s1 atanf_x6 = atanf_t,atanf_tsq,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_t4 = atanf_tsq,atanf_tsq,f0 +
nop.i 999;; +} + + +// Zero, infinity and NaN inputs (p8 from the 0xe7 fclass) leave the polynomial path here; atanf_piby2 has already been loaded. +{ .mfb + nop.m 999 + fma.s1 atanf_x5 = atanf_t,atanf_xcub,f0 +(p8) br.cond.spnt L(ATANF_X_INF_NAN_ZERO) +} +;; + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_r1 = atanf_b,atanf_coeff_R1,f1 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_r3 = atanf_b,atanf_coeff_R5,atanf_coeff_R4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_r2 = atanf_b,atanf_coeff_R3,atanf_coeff_R2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_z8 = atanf_z4,atanf_z4,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_q2 = atanf_t,atanf_coeff_Q5,atanf_coeff_Q4 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_q3 = atanf_t,atanf_coeff_Q7,atanf_coeff_Q6 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_z5 = atanf_z,atanf_z4,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_q1 = atanf_t,atanf_coeff_Q9,atanf_coeff_Q8 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_q4 = atanf_t,atanf_coeff_Q1,atanf_coeff_Q0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_q5 = atanf_t,atanf_coeff_Q3,atanf_coeff_Q2 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_p4 = f8,atanf_coeff_P1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_p5 = atanf_t,atanf_coeff_P4,atanf_coeff_P3 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_r1 = atanf_z8,atanf_poly_r1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_z8_bsq = atanf_z8,atanf_bsq,f0 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_q2 = atanf_tsq,atanf_poly_q3,atanf_poly_q2 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_r2 = atanf_bsq,atanf_poly_r3,atanf_poly_r2 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_p2 = atanf_t,atanf_coeff_P8,atanf_coeff_P7 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_q1 = atanf_poly_q1,f1,atanf_tsq + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_z13 = atanf_z5,atanf_z8,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_p1 = atanf_t,atanf_coeff_P10,atanf_coeff_P9 +
nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_p4 = atanf_t,atanf_poly_p4,f8 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_q4 = atanf_tsq,atanf_poly_q5,atanf_poly_q4 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_p3 = atanf_t,atanf_coeff_P6,atanf_coeff_P5 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_p5 = atanf_t,atanf_poly_p5,atanf_coeff_P2 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_x11 = atanf_x5,atanf_x6,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_r = atanf_z8_bsq,atanf_poly_r2,atanf_poly_r1 + nop.i 999;; +} + + +// NOTE(review): this is the only fma in the polynomial path written without the .s1 completer - confirm that is intended. +{ .mfi + nop.m 999 + fma atanf_sgnx_piby2 = atanf_sgn_x,atanf_piby2,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 atanf_poly_q2 = atanf_t4,atanf_poly_q1,atanf_poly_q2 + nop.i 999;; +} + + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_p1 = atanf_tsq,atanf_poly_p1,atanf_poly_p2 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_p4 = atanf_x5,atanf_poly_p5,atanf_poly_p4 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 atanf_z21_poly_r = atanf_z13,atanf_poly_r,f0 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_q = atanf_t4,atanf_poly_q2,atanf_poly_q4 + nop.i 999;; +} + +{ .mfi + nop.m 999 + fma.s1 atanf_poly_p1 = atanf_tsq,atanf_poly_p1,atanf_poly_p3 + nop.i 999;; +} + +// p7 path (abs_x > 1): result = sgnx_piby2 - poly_q * z21_poly_r. +{ .mfi + nop.m 999 +//(atanf_pred_GT1) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2 +(p7) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2 + nop.i 999;; +} + +// p6 path (abs_x <= 1): result = x11 * poly_p1 + poly_p4. +{ .mfb + nop.m 999 +//(atanf_pred_LE1) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4 +(p6) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4 + br.ret.sptk b0 +} + + + +// Special inputs: an infinite x is replaced by atanf_piby2 carrying the sign of x (fmerge.s); zero and NaN inputs reach fnorm.s unchanged. +L(ATANF_X_INF_NAN_ZERO): + + fclass.m p8,p9 = f8,0x23 // @inf +;; +(p8) fmerge.s f8 = f8, atanf_piby2 +;; + fnorm.s f8 = f8 + br.ret.sptk b0 + +.endp atanf +ASM_SIZE_DIRECTIVE(atanf) diff --git a/sysdeps/ia64/fpu/s_atanl.S b/sysdeps/ia64/fpu/s_atanl.S new file mode 100644 index 0000000000..0192ac6a18 --- /dev/null +++
b/sysdeps/ia64/fpu/s_atanl.S @@ -0,0 +1,1994 @@ +.file "atanl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// +// ********************************************************************* +// +// History +// 2/02/00 (hand-optimized) +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. 
+// +// ********************************************************************* +// +// Function: atanl(x) = inverse tangent(x), for double extended x values +// Function: atan2l(y,x) = atan(y/x), for double extended x values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9-f15 +// f32-f79 +// +// General Purpose Registers: +// r32-r48 +// r49,r50,r51,r52 (Arguments to error support for 0,0 case) +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Underflow exceptions may occur +// Special error handling for the y=0 and x=0 case +// Inexact raised when appropriate by algorithm +// +// atanl(SNaN) = QNaN +// atanl(QNaN) = QNaN +// atanl(+/-0) = +/- 0 +// atanl(+/-Inf) = +/-pi/2 +// +// atan2l(Any NaN for x or y) = QNaN +// atan2l(+/-0,x) = +/-0 for x > 0 +// atan2l(+/-0,x) = +/-pi for x < 0 +// atan2l(+/-0,+0) = +/-0 +// atan2l(+/-0,-0) = +/-pi +// atan2l(y,+/-0) = pi/2 y > 0 +// atan2l(y,+/-0) = -pi/2 y < 0 +// atan2l(+/-y, Inf) = +/-0 for finite y > 0 +// atan2l(+/-Inf, x) = +/-pi/2 for finite x +// atan2l(+/-y, -Inf) = +/-pi for finite y > 0 +// atan2l(+/-Inf, Inf) = +/-pi/4 +// atan2l(+/-Inf, -Inf) = +/-3pi/4 +// +// ********************************************************************* +// +// Mathematical Description +// --------------------------- +// +// The function ATANL( Arg_Y, Arg_X ) returns the "argument" +// or the "phase" of the complex number +// +// Arg_X + i Arg_Y +// +// or equivalently, the angle in radians from the positive +// x-axis to the line joining the origin and the point +// (Arg_X,Arg_Y) +// +// +// (Arg_X, Arg_Y) x +// \ +// \ +// \ +// \ +// \ angle between is ATANL(Arg_Y,Arg_X) + + + + +// \ +// ------------------> X-axis + +// Origin +// +// Moreover, this angle is 
reported in the range [-pi,pi] thus +// +// -pi <= ATANL( Arg_Y, Arg_X ) <= pi. +// +// From the geometry, it is easy to define ATANL when one of +// Arg_X or Arg_Y is +-0 or +-inf: +// +// +// \ Y | +// X \ | +0 | -0 | +inf | -inf | finite non-zero +// \ | | | | | +// ______________________________________________________ +// | | | | +// +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2 +// | qNaN | | | +// -------------------------------------------------------- +// | | | | | +// +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0 +// -------------------------------------------------------- +// | | | | | +// -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi +// -------------------------------------------------------- +// finite | X>0? | pi/2 | -pi/2 | normal case +// non-zero| sign(Y)*0: | | | +// | sign(Y)*pi | | | +// +// +// One must take note that ATANL is NOT the arctangent of the +// value Arg_Y/Arg_X; but rather ATANL and arctan are related +// in a slightly more complicated way as follows: +// +// Let U := max(|Arg_X|, |Arg_Y|); V := min(|Arg_X|, |Arg_Y|); +// sign_X be the sign bit of Arg_X, i.e., sign_X is 0 or 1; +// s_X be the sign of Arg_X, i.e., s_X = (-1)^sign_X; +// +// sign_Y be the sign bit of Arg_Y, i.e., sign_Y is 0 or 1; +// s_Y be the sign of Arg_Y, i.e., s_Y = (-1)^sign_Y; +// +// swap be 0 if |Arg_X| >= |Arg_Y| and 1 otherwise. 
+// +// Then, ATANL(Arg_Y, Arg_X) = +// +// / arctan(V/U) \ sign_X = 0 & swap = 0 +// | pi/2 - arctan(V/U) | sign_X = 0 & swap = 1 +// s_Y * | | +// | pi - arctan(V/U) | sign_X = 1 & swap = 0 +// \ pi/2 + arctan(V/U) / sign_X = 1 & swap = 1 +// +// +// This relationship also suggest that the algorithm's major +// task is to calculate arctan(V/U) for 0 < V <= U; and the +// final Result is given by +// +// s_Y * { (P_hi + P_lo) + sigma * arctan(V/U) } +// +// where +// +// (P_hi,P_lo) represents M(sign_X,swap)*(pi/2) accurately +// +// M(sign_X,swap) = 0 for sign_X = 0 and swap = 0 +// 1 for swap = 1 +// 2 for sign_X = 1 and swap = 0 +// +// and +// +// sigma = { (sign_X XOR swap) : -1.0 : 1.0 } +// +// = (-1) ^ ( sign_X XOR swap ) +// +// Both (P_hi,P_lo) and sigma can be stored in a table and fetched +// using (sign_X,swap) as an index. (P_hi, P_lo) can be stored as a +// double-precision, and single-precision pair; and sigma can +// obviously be just a single-precision number. +// +// In the algorithm we propose, arctan(V/U) is calculated to high accuracy +// as A_hi + A_lo. Consequently, the Result ATANL( Arg_Y, Arg_X ) is +// given by +// +// s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo) +// +// We now discuss the calculation of arctan(V/U) for 0 < V <= U. +// +// For (V/U) < 2^(-3), we use a simple polynomial of the form +// +// z + z^3*(P_1 + z^2*(P_2 + z^2*(P_3 + ... + P_8))) +// +// where z = V/U. +// +// For the sake of accuracy, the first term "z" must approximate V/U to +// extra precision. For z^3 and higher power, a working precision +// approximation to V/U suffices. Thus, we obtain: +// +// z_hi + z_lo = V/U to extra precision and +// z = V/U to working precision +// +// The value arctan(V/U) is delivered as two pieces (A_hi, A_lo) +// +// (A_hi,A_lo) = (z_hi, z^3*(P_1 + ... + P_8) + z_lo). +// +// +// For 2^(-3) <= (V/U) <= 1, we use a table-driven approach. +// Consider +// +// (V/U) = 2^k * 1.b_1 b_2 .... b_63 b_64 b_65 .... 
+// +// Define +// +// z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1 +// +// then +// / \ +// | (V/U) - z_hi | + +// arctan(V/U) = arctan(z_hi) + arctan| -------------- | +// | 1 + (V/U)*z_hi | +// \ / +// +// / \ +// | V - z_hi*U | + +// = arctan(z_hi) + arctan| -------------- | +// | U + V*z_hi | +// \ / +// +// = arctan(z_hi) + arctan( V' / U' ) +// +// +// where +// +// V' = V - U*z_hi; U' = U + V*z_hi. +// +// Let +// +// w_hi + w_lo = V'/U' to extra precision and +// w = V'/U' to working precision +// +// then we can approximate arctan(V'/U') by +// +// arctan(V'/U') = w_hi + w_lo +// + w^3*(Q_1 + w^2*(Q_2 + w^2*(Q_3 + w^2*Q_4))) +// +// = w_hi + w_lo + poly +// +// Finally, arctan(z_hi) is calculated beforehand and stored in a table +// as Tbl_hi, Tbl_lo. Thus, +// +// (A_hi, A_lo) = (Tbl_hi, w_hi+(poly+(w_lo+Tbl_lo))) +// +// This completes the mathematical description. +// +// +// Algorithm +// ------------- +// +// Step 0. Check for unsupported format. +// +// If +// ( expo(Arg_X) not zero AND msb(Arg_X) = 0 ) OR +// ( expo(Arg_Y) not zero AND msb(Arg_Y) = 0 ) +// +// then one of the arguments is unsupported. Generate an +// invalid and return qNaN. +// +// Step 1. Initialize +// +// Normalize Arg_X and Arg_Y and set the following +// +// sign_X := sign_bit(Arg_X) +// s_Y := (sign_bit(Arg_Y)==0? 1.0 : -1.0) +// swap := (|Arg_X| >= |Arg_Y|? 0 : 1 ) +// U := max( |Arg_X|, |Arg_Y| ) +// V := min( |Arg_X|, |Arg_Y| ) +// +// execute: frcpa E, pred, V, U +// If pred is 0, go to Step 5 for special cases handling. +// +// Step 2. Decide on branch. +// +// Q := E * V +// If Q < 2^(-3) go to Step 4 for simple polynomial case. +// +// Step 3. Table-driven algorithm. +// +// Q is represented as +// +// 2^(-k) * 1.b_1 b_2 b_3 ... b_63; k = 0,1,2,3 +// +// and that if k = 0, b_1 = b_2 = b_3 = b_4 = 0. +// +// Define +// +// z_hi := 2^(-k) * 1.b_1 b_2 b_3 b_4 1 +// +// (note that there are 49 possible values of z_hi). +// +// ...We now calculate V' and U'.
While V' is representable +// ...as a 64-bit number because of cancellation, U' is +// ...not in general a 64-bit number. Obtaining U' accurately +// ...requires two working precision numbers +// +// U_prime_hi := U + V * z_hi ...WP approx. to U' +// U_prime_lo := ( U - U_prime_hi ) + V*z_hi ...observe order +// V_prime := V - U * z_hi ...this is exact +// +// C_hi := frcpa (1.0, U_prime_hi) ...C_hi approx 1/U'_hi +// +// loop 3 times +// C_hi := C_hi + C_hi*(1.0 - C_hi*U_prime_hi) +// +// ...at this point C_hi is (1/U_prime_hi) to roughly 64 bits +// +// w_hi := V_prime * C_hi ...w_hi is V_prime/U_prime to +// ...roughly working precision +// +// ...note that we want w_hi + w_lo to approximate +// ...V_prime/(U_prime_hi + U_prime_lo) to extra precision +// ...but for now, w_hi is good enough for the polynomial +// ...calculation. +// +// wsq := w_hi*w_hi +// poly := w_hi*wsq*(Q_1 + wsq*(Q_2 + wsq*(Q_3 + wsq*Q_4))) +// +// Fetch +// (Tbl_hi, Tbl_lo) = atan(z_hi) indexed by (k,b_1,b_2,b_3,b_4) +// ...Tbl_hi is a double-precision number +// ...Tbl_lo is a single-precision number +// +// (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo) +// ...as discussed previously. Again, the implementation can +// ...choose to fetch P_hi and P_lo from a table indexed by +// ...(sign_X, swap). +// ...P_hi is a double-precision number; +// ...P_lo is a single-precision number. +// +// ...calculate w_lo so that w_hi + w_lo is V'/U' accurately +// w_lo := ((V_prime - w_hi*U_prime_hi) - +// w_hi*U_prime_lo) * C_hi ...observe order +// +// +// ...Ready to deliver arctan(V'/U') as A_hi, A_lo +// A_hi := Tbl_hi +// A_lo := w_hi + (poly + (Tbl_lo + w_lo)) ...observe order +// +// ...Deliver final Result +// ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo) +// +// sigma := ( (sign_X XOR swap) ?
-1.0 : 1.0 ) +// ...sigma can be obtained by a table lookup using +// ...(sign_X,swap) as index and stored as single precision +// ...sigma should be calculated earlier +// +// P_hi := s_Y*P_hi +// A_hi := s_Y*A_hi +// +// Res_hi := P_hi + sigma*A_hi ...this is exact because +// ...both P_hi and Tbl_hi +// ...are double-precision +// ...and |Tbl_hi| > 2^(-4) +// ...P_hi is either 0 or +// ...between (1,4) +// +// Res_lo := sigma*A_lo + P_lo +// +// Return Res_hi + s_Y*Res_lo in user-defined rounding control +// +// Step 4. Simple polynomial case. +// +// ...E and Q are inherited from Step 2. +// +// A_hi := Q ...Q is inherited from Step 2 Q approx V/U +// +// loop 3 times +// E := E + E*(1.0 - E*U) +// ...at this point E approximates 1/U to roughly working precision +// +// z := V * E ...z approximates V/U to roughly working precision +// zsq := z * z +// z8 := zsq * zsq; z8 := z8 * z8 +// +// poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) +// poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3)) +// +// poly := poly1 + z8*poly2 +// +// z_lo := (V - A_hi*U)*E +// +// A_lo := z*poly + z_lo +// ...A_hi, A_lo approximate arctan(V/U) accurately +// +// (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo) +// ...one can store the M(sign_X,swap) as single precision +// ...values +// +// ...Deliver final Result +// ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo) +// +// sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 ) +// ...sigma can be obtained by a table lookup using +// ...(sign_X,swap) as index and stored as single precision +// ...sigma should be calculated earlier +// +// P_hi := s_Y*P_hi +// A_hi := s_Y*A_hi +// +// Res_hi := P_hi + sigma*A_hi ...need to compute +// ...P_hi + sigma*A_hi +// ...exactly +// +// tmp := (P_hi - Res_hi) + sigma*A_hi +// +// Res_lo := s_Y*(sigma*A_lo + P_lo) + tmp +// +// Return Res_hi + Res_lo in user-defined rounding control +// +// Step 5.
Special Cases +// +// If pred is 0 where pred is obtained in +// frcpa E, pred, V, U +// +// we are in one of those special cases of 0,+-inf or NaN +// +// If one of U and V is NaN, return U+V (which will generate +// invalid in case one is a signaling NaN). Otherwise, +// return the Result as described in the table +// +// +// +// \ Y | +// X \ | +0 | -0 | +inf | -inf | finite non-zero +// \ | | | | | +// ______________________________________________________ +// | | | | +// +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2 +// | qNaN | | | +// -------------------------------------------------------- +// | | | | | +// +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0 +// -------------------------------------------------------- +// | | | | | +// -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi +// -------------------------------------------------------- +// finite | X>0? | pi/2 | -pi/2 | +// non-zero| sign(Y)*0: | | | N/A +// | sign(Y)*pi | | | +// +// + +#include "libm_support.h" + +ArgY_orig = f8 +Result = f8 +FR_RESULT = f8 +ArgX_orig = f9 +ArgX = f10 +FR_X = f10 +ArgY = f11 +FR_Y = f11 +s_Y = f12 +U = f13 +V = f14 +E = f15 +Q = f32 +z_hi = f33 +U_prime_hi = f34 +U_prime_lo = f35 +V_prime = f36 +C_hi = f37 +w_hi = f38 +w_lo = f39 +wsq = f40 +poly = f41 +Tbl_hi = f42 +Tbl_lo = f43 +P_hi = f44 +P_lo = f45 +A_hi = f46 +A_lo = f47 +sigma = f48 +Res_hi = f49 +Res_lo = f50 +Z = f52 +zsq = f53 +z8 = f54 +poly1 = f55 +poly2 = f56 +z_lo = f57 +tmp = f58 +P_1 = f59 +Q_1 = f60 +P_2 = f61 +Q_2 = f62 +P_3 = f63 +Q_3 = f64 +P_4 = f65 +Q_4 = f66 +P_5 = f67 +P_6 = f68 +P_7 = f69 +P_8 = f70 +TWO_TO_NEG3 = f71 +U_hold = f72 +C_hi_hold = f73 +E_hold = f74 +M = f75 +ArgX_abs = f76 +ArgY_abs = f77 +Result_lo = f78 +A_temp = f79 +GR_SAVE_PFS = r33 +GR_SAVE_B0 = r34 +GR_SAVE_GP = r35 +sign_X = r36 +sign_Y = r37 +swap = r38 +table_ptr1 = r39 +table_ptr2 = r40 +k = r41 +lookup = r42 +exp_ArgX = r43 +exp_ArgY = r44 +exponent_Q = r45 +significand_Q = r46 +special = r47 +special1 = r48 +GR_Parameter_X =
r49 +GR_Parameter_Y = r50 +GR_Parameter_RESULT = r51 +GR_Parameter_TAG = r52 +int_temp = r52 + +#ifdef _LIBC +.rodata +#else +.data +#endif +.align 64 + +Constants_atan: +ASM_TYPE_DIRECTIVE(Constants_atan,@object) +data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000 +// double pi/2, single lo_pi/2, two**(-3) +data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1 +data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2 +data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3 +data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4 +data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5 +data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6 +data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7 +data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8 +data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1 +data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2 +data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3 +data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4 +// +// Entries Tbl_hi (double precision) +// B = 1+Index/16+1/32 Index = 0 +// Entries Tbl_lo (single precision) +// B = 1+Index/16+1/32 Index = 0 +// +data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000 +// +// Entries Tbl_hi (double precision) Index = 0,1,...,15 +// B = 2^(-1)*(1+Index/16+1/32) +// Entries Tbl_lo (single precision) +// Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32) +// +data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000 +data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000 +data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000 +data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000 +data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000 +data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000 +data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000 +data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000 +data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000 +data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000 +data4 0x115D7B8D, 
0x3FE6220D, 0x24ED487B, 0x00000000 +data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000 +data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000 +data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000 +data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000 +data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000 +// +// Entries Tbl_hi (double precision) Index = 0,1,...,15 +// B = 2^(-2)*(1+Index/16+1/32) +// Entries Tbl_lo (single precision) +// Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32) +// +data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000 +data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000 +data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000 +data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000 +data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000 +data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000 +data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000 +data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000 +data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000 +data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000 +data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000 +data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000 +data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000 +data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000 +data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000 +data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000 +// +// Entries Tbl_hi (double precision) Index = 0,1,...,15 +// B = 2^(-3)*(1+Index/16+1/32) +// Entries Tbl_lo (single precision) +// Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32) +// +data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000 +data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000 +data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000 +data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000 +data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000 +data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000 +data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000 +data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000 +data4 0xE398EBC7, 
0x3FC8350B, 0x23D4ADDA, 0x00000000 +data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000 +data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000 +data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000 +data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000 +data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000 +data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000 +data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000 + +data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // PI two doubles +data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // PI_by_2 two dbles +data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // PI_by_4 two dbles +data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3PI_by_4 two dbles +ASM_SIZE_DIRECTIVE(Constants_atan) + + +.text +.proc atanl# +.global atanl# +.align 64 + +atanl: +{ .mfb + nop.m 999 +(p0) mov ArgX_orig = f1 +(p0) br.cond.sptk atan2l ;; +} +.endp atanl +ASM_SIZE_DIRECTIVE(atanl) + +.text +.proc atan2l# +.global atan2l# +#ifdef _LIBC +.proc __atan2l# +.global __atan2l# +.proc __ieee754_atan2l# +.global __ieee754_atan2l# +#endif +.align 64 + + +atan2l: +#ifdef _LIBC +__atan2l: +__ieee754_atan2l: +#endif +{ .mfi +alloc r32 = ar.pfs, 0, 17 , 4, 0 +(p0) mov ArgY = ArgY_orig +} +{ .mfi + nop.m 999 +(p0) mov ArgX = ArgX_orig + nop.i 999 +};; +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = ArgY_orig, 0x103 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// +// Save original input args and load table ptr. +// +(p0) fclass.m.unc p6,p0 = ArgX_orig, 0x103 + nop.i 999 +};; +{ .mfi +(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp +(p0) fclass.m.unc p0,p9 = ArgY_orig, 0x1FF + nop.i 999 ;; +} +{ .mfi + ld8 table_ptr1 = [table_ptr1] +(p0) fclass.m.unc p0,p8 = ArgX_orig, 0x1FF + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p13,p0 = ArgY_orig, 0x0C3 + nop.i 999 ;; +} +{ .mfi +(p0) fclass.m.unc p12,p0 = ArgX_orig, 0x0C3 + nop.i 999 +} + + +// +// Check for NatVals. 
+// Check for everything - if false, then must be pseudo-zero +// or pseudo-nan (IA unsupporteds). +// +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(ATANL_NATVAL) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.spnt L(ATANL_NATVAL) ;; +} +{ .mib +(p0) ldfd P_hi = [table_ptr1],8 + nop.i 999 +(p8) br.cond.spnt L(ATANL_UNSUPPORTED) ;; +} +{ .mbb +(p0) add table_ptr2 = 96, table_ptr1 +(p9) br.cond.spnt L(ATANL_UNSUPPORTED) +// +// Load double precision high-order part of pi +// +(p12) br.cond.spnt L(ATANL_NAN) ;; +} +{ .mfb + nop.m 999 +(p0) fnorm.s1 ArgX = ArgX +(p13) br.cond.spnt L(ATANL_NAN) ;; +} +// +// Normalize the input argument. +// Branch out if NaN inputs +// +{ .mmf +(p0) ldfs P_lo = [table_ptr1], 4 + nop.m 999 +(p0) fnorm.s1 ArgY = ArgY ;; +} +{ .mmf + nop.m 999 +(p0) ldfs TWO_TO_NEG3 = [table_ptr1], 180 +// +// U = max(ArgX_abs,ArgY_abs) +// V = min(ArgX_abs,ArgY_abs) +// if PR1, swap = 0 +// if PR2, swap = 1 +// +(p0) mov M = f1 ;; +} +{ .mfi + nop.m 999 +// +// Get exp and sign of ArgX +// Get exp and sign of ArgY +// Load 2**(-3) and increment ptr to Q_4. +// +(p0) fmerge.s ArgX_abs = f1, ArgX + nop.i 999 ;; +} +// +// load single precision low-order part of pi = P_lo +// +{ .mfi +(p0) getf.exp sign_X = ArgX +(p0) fmerge.s ArgY_abs = f1, ArgY + nop.i 999 ;; +} +{ .mii +(p0) getf.exp sign_Y = ArgY + nop.i 999 ;; +(p0) shr sign_X = sign_X, 17 ;; +} +{ .mii + nop.m 999 +(p0) shr sign_Y = sign_Y, 17 ;; +(p0) cmp.eq.unc p8, p9 = 0x00000, sign_Y ;; +} +{ .mfi + nop.m 999 +// +// Is ArgX_abs >= ArgY_abs +// Is sign_Y == 0? 
+// +(p0) fmax.s1 U = ArgX_abs, ArgY_abs + nop.i 999 +} +{ .mfi + nop.m 999 +// +// ArgX_abs = |ArgX| +// ArgY_abs = |ArgY| +// sign_X is sign bit of ArgX +// sign_Y is sign bit of ArgY +// +(p0) fcmp.ge.s1 p6, p7 = ArgX_abs, ArgY_abs + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmin.s1 V = ArgX_abs, ArgY_abs + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p8) fadd.s1 s_Y = f0, f1 +(p6) cmp.eq.unc p10, p11 = 0x00000, sign_X +} +{ .mii +(p6) add swap = r0, r0 + nop.i 999 ;; +(p7) add swap = 1, r0 +} +{ .mfi + nop.m 999 +// +// Let M = 1.0 +// if p8, s_Y = 1.0 +// if p9, s_Y = -1.0 +// +(p10) fsub.s1 M = M, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fsub.s1 s_Y = f0, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) frcpa.s1 E, p6 = V, U + nop.i 999 ;; +} +{ .mbb + nop.m 999 +// +// E = frcpa(V,U) +// +(p6) br.cond.sptk L(ATANL_STEP2) +(p0) br.cond.spnt L(ATANL_SPECIAL_HANDLING) ;; +} +L(ATANL_STEP2): +{ .mfi + nop.m 999 +(p0) fmpy.s1 Q = E, V + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fcmp.eq.s0 p0, p9 = f1, ArgY_orig + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Is Q < 2**(-3)? +// +(p0) fcmp.eq.s0 p0, p8 = f1, ArgX_orig + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fadd.s1 M = M, f1 + nop.i 999 ;; +} +{ .mlx + nop.m 999 +// ************************************************* +// ********************* STEP2 ********************* +// ************************************************* +(p0) movl special = 0x8400000000000000 +} +{ .mlx + nop.m 999 +// +// lookup = b_1 b_2 b_3 B_4 +// +(p0) movl special1 = 0x0000000000000100 ;; +} +{ .mfi + nop.m 999 +// +// Do fnorms to raise any denormal operand +// exceptions. 
+// +(p0) fmpy.s1 P_hi = M, P_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 P_lo = M, P_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Q = E * V +// +(p0) fcmp.lt.unc.s1 p6, p7 = Q, TWO_TO_NEG3 + nop.i 999 ;; +} +{ .mmb +(p0) getf.sig significand_Q = Q +(p0) getf.exp exponent_Q = Q + nop.b 999 ;; +} +{ .mmi + nop.m 999 ;; +(p0) andcm k = 0x0003, exponent_Q +(p0) extr.u lookup = significand_Q, 59, 4 ;; +} +{ .mib + nop.m 999 +(p0) dep special = lookup, special, 59, 4 +// +// Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 +// +(p6) br.cond.spnt L(ATANL_POLY) ;; +} +{ .mfi +(p0) cmp.eq.unc p8, p9 = 0x0000, k +(p0) fmpy.s1 P_hi = s_Y, P_hi +// +// We waited a few extra cycles so P_lo and P_hi could be calculated. +// Load the constant 256 for loading up table entries. +// +// ************************************************* +// ******************** STEP3 ********************** +// ************************************************* +(p0) add table_ptr2 = 16, table_ptr1 +} +// +// Let z_hi have exponent and sign of original Q +// Load the Tbl_hi(0) else, increment pointer. +// +{ .mii +(p0) ldfe Q_4 = [table_ptr1], -16 +(p0) xor swap = sign_X, swap ;; +(p9) sub k = k, r0, 1 +} +{ .mmi +(p0) setf.sig z_hi = special +(p0) ldfe Q_3 = [table_ptr1], -16 +(p9) add table_ptr2 = 16, table_ptr2 ;; +} +// +// U_hold = U - U_prime_hi +// k = k * 256 - Result can be 0, 256, or 512. 
+// +{ .mmb +(p0) ldfe Q_2 = [table_ptr1], -16 +(p8) ldfd Tbl_hi = [table_ptr2], 8 + nop.b 999 ;; +} +// +// U_prime_lo = U_hold + V * z_hi +// lookup -> lookup * 16 + k +// +{ .mmi +(p0) ldfe Q_1 = [table_ptr1], -16 ;; +(p8) ldfs Tbl_lo = [table_ptr2], 8 +// +// U_prime_hi = U + V * z_hi +// Load the Tbl_lo(0) +// +(p9) pmpy2.r k = k, special1 ;; +} +{ .mii + nop.m 999 + nop.i 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +(p9) shladd lookup = lookup, 0x0004, k ;; +} +{ .mmi +(p9) add table_ptr2 = table_ptr2, lookup ;; +// +// V_prime = V - U * z_hi +// +(p9) ldfd Tbl_hi = [table_ptr2], 8 + nop.i 999 ;; +} +{ .mmf + nop.m 999 +// +// C_hi = frcpa(1,U_prime_hi) +// +(p9) ldfs Tbl_lo = [table_ptr2], 8 +// +// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 +// Point to beginning of Tbl_hi entries - k = 0. +// +(p0) fmerge.se z_hi = Q, z_hi ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 U_prime_hi = V, z_hi, U + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fnma.s1 V_prime = U, z_hi, V + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) mov A_hi = Tbl_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fsub.s1 U_hold = U, U_prime_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) frcpa.s1 C_hi, p6 = f1, U_prime_hi + nop.i 999 ;; +} +{ .mfi +(p0) cmp.eq.unc p7, p6 = 0x00000, swap +(p0) fmpy.s1 A_hi = s_Y, A_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly = wsq * poly +// +(p7) fadd.s1 sigma = f0, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 U_prime_lo = z_hi, V, U_hold + nop.i 999 +} +{ .mfi + nop.m 999 +(p6) fsub.s1 sigma = f0, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// A_lo = A_lo + w_hi +// A_hi = s_Y * A_hi +// +(p0) fma.s1 Res_hi = sigma, A_hi, P_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// C_hi_hold = 1 - C_hi * U_prime_hi (1) +// +(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi + nop.i 
999 ;; +} +{ .mfi + nop.m 999 +// +// C_hi = C_hi + C_hi * C_hi_hold (1) +// +(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// C_hi_hold = 1 - C_hi * U_prime_hi (2) +// +(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// C_hi = C_hi + C_hi * C_hi_hold (2) +// +(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// C_hi_hold = 1 - C_hi * U_prime_hi (3) +// +(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// C_hi = C_hi + C_hi * C_hi_hold (3) +// +(p0) fmpy.s1 w_hi = V_prime, C_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// w_hi = V_prime * C_hi +// +(p0) fmpy.s1 wsq = w_hi, w_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fnma.s1 w_lo = w_hi, U_prime_hi, V_prime + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// wsq = w_hi * w_hi +// w_lo = = V_prime - w_hi * U_prime_hi +// +(p0) fma.s1 poly = wsq, Q_4, Q_3 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fnma.s1 w_lo = w_hi, U_prime_lo, w_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly = Q_3 + wsq * Q_4 +// w_lo = = w_lo - w_hi * U_prime_lo +// +(p0) fma.s1 poly = wsq, poly, Q_2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 w_lo = C_hi, w_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly = Q_2 + wsq * poly +// w_lo = = w_lo * C_hi +// +(p0) fma.s1 poly = wsq, poly, Q_1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fadd.s1 A_lo = Tbl_lo, w_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode) +// +(p0) fmpy.s0 Q_1 = Q_1, Q_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly = Q_1 + wsq * poly +// A_lo = Tbl_lo + w_lo +// swap = xor(swap,sign_X) +// +(p0) fmpy.s1 poly = wsq, poly + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Is (swap) != 0 ? 
+// poly = wsq * poly +// A_hi = Tbl_hi +// +(p0) fmpy.s1 poly = w_hi, poly + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (PR_1) sigma = -1.0 +// if (PR_2) sigma = 1.0 +// +(p0) fadd.s1 A_lo = A_lo, poly + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// P_hi = s_Y * P_hi +// A_lo = A_lo + poly +// +(p0) fadd.s1 A_lo = A_lo, w_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 Res_lo = sigma, A_lo, P_lo + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Res_hi = P_hi + sigma * A_hi +// Res_lo = P_lo + sigma * A_lo +// +(p0) fma.s0 Result = Res_lo, s_Y, Res_hi +// +// Raise inexact. +// +br.ret.sptk b0 ;; +} +// +// poly1 = P_5 + zsq * poly1 +// poly2 = zsq * poly2 +// +L(ATANL_POLY): +{ .mmf +(p0) xor swap = sign_X, swap + nop.m 999 +(p0) fnma.s1 E_hold = E, U, f1 ;; +} +{ .mfi + nop.m 999 +(p0) mov A_temp = Q +// +// poly1 = P_4 + zsq * poly1 +// swap = xor(swap,sign_X) +// +// sign_X gr_002 +// swap gr_004 +// poly1 = poly1 <== Done with poly1 +// poly1 = P_4 + zsq * poly1 +// swap = xor(swap,sign_X) +// +(p0) cmp.eq.unc p7, p6 = 0x00000, swap +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 P_hi = s_Y, P_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p6) fsub.s1 sigma = f0, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fadd.s1 sigma = f0, f1 + nop.i 999 ;; +} + +// *********************************************** +// ******************** STEP4 ******************** +// *********************************************** + +{ .mmi + nop.m 999 +(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + + +{ .mfi + nop.m 999 +(p0) fma.s1 E = E, E_hold, E +// +// Following: +// Iterate 3 times E = E + E*(1.0 - E*U) +// Also load P_8, P_7, P_6, P_5, P_4 +// E_hold = 1.0 - E * U (1) +// A_temp = Q +// +(p0) add table_ptr1 = 128, table_ptr1 ;; +} +{ .mmf + nop.m 999 +// +// E = E + E_hold*E (1) +// Point to P_8. 
+// +(p0) ldfe P_8 = [table_ptr1], -16 +// +// poly = z8*poly1 + poly2 (Typo in writeup) +// Is (swap) != 0 ? +// +(p0) fnma.s1 z_lo = A_temp, U, V ;; +} +{ .mmb + nop.m 999 +// +// E_hold = 1.0 - E * U (2) +// +(p0) ldfe P_7 = [table_ptr1], -16 + nop.b 999 ;; +} +{ .mmb + nop.m 999 +// +// E = E + E_hold*E (2) +// +(p0) ldfe P_6 = [table_ptr1], -16 + nop.b 999 ;; +} +{ .mmb + nop.m 999 +// +// E_hold = 1.0 - E * U (3) +// +(p0) ldfe P_5 = [table_ptr1], -16 + nop.b 999 ;; +} +{ .mmf + nop.m 999 +// +// E = E + E_hold*E (3) +// +// +// At this point E approximates 1/U to roughly working precision +// z = V*E approximates V/U +// +(p0) ldfe P_4 = [table_ptr1], -16 +(p0) fnma.s1 E_hold = E, U, f1 ;; +} +{ .mmb + nop.m 999 +// +// Z = V * E +// +(p0) ldfe P_3 = [table_ptr1], -16 + nop.b 999 ;; +} +{ .mmb + nop.m 999 +// +// zsq = Z * Z +// +(p0) ldfe P_2 = [table_ptr1], -16 + nop.b 999 ;; +} +{ .mmb + nop.m 999 +// +// z8 = zsq * zsq +// +(p0) ldfe P_1 = [table_ptr1], -16 + nop.b 999 ;; +} +{ .mlx + nop.m 999 +(p0) movl int_temp = 0x24005 +} +{ .mfi + nop.m 999 +(p0) fma.s1 E = E, E_hold, E + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fnma.s1 E_hold = E, U, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 E = E, E_hold, E + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 Z = V, E + nop.i 999 +} +{ .mfi + nop.m 999 +// +// z_lo = V - A_temp * U +// if (PR_2) sigma = 1.0 +// +(p0) fmpy.s1 z_lo = z_lo, E + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 zsq = Z, Z + nop.i 999 +} +{ .mfi + nop.m 999 +// +// z_lo = z_lo * E +// if (PR_1) sigma = -1.0 +// +(p0) fadd.s1 A_hi = A_temp, z_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// z8 = z8 * z8 +// +// +// Now what we want to do is +// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) +// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3)) +// +(p0) fma.s1 poly1 = zsq, P_8, P_7 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 poly2 = zsq, P_3, P_2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 z8 = zsq, zsq + nop.i 
999 +} +{ .mfi + nop.m 999 +(p0) fsub.s1 A_temp = A_temp, A_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// A_lo = Z * poly + z_lo +// +(p0) fmerge.s tmp = A_hi, A_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly1 = P_7 + zsq * P_8 +// poly2 = P_2 + zsq * P_3 +// +(p0) fma.s1 poly1 = zsq, poly1, P_6 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 poly2 = zsq, poly2, P_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 z8 = z8, z8 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fadd.s1 z_lo = A_temp, z_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly1 = P_6 + zsq * poly1 +// poly2 = P_2 + zsq * poly2 +// +(p0) fma.s1 poly1 = zsq, poly1, P_5 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 poly2 = poly2, zsq + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Result = Res_hi + Res_lo (User Supplied Rounding Mode) +// +(p0) fmpy.s1 P_5 = P_5, P_5 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 poly1 = zsq, poly1, P_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 poly = z8, poly1, poly2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Fixup added to force inexact later - +// A_hi = A_temp + z_lo +// z_lo = (A_temp - A_hi) + z_lo +// +(p0) fma.s1 A_lo = Z, poly, z_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fadd.s1 A_hi = tmp, A_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fsub.s1 tmp = tmp, A_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 A_hi = s_Y, A_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fadd.s1 A_lo = tmp, A_lo + nop.i 999 +} +{ .mfi +(p0) setf.exp tmp = int_temp +// +// P_hi = s_Y * P_hi +// A_hi = s_Y * A_hi +// +(p0) fma.s1 Res_hi = sigma, A_hi, P_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6,p0 = A_lo, 0x007 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p6) mov A_lo = tmp + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Res_hi = P_hi + sigma * A_hi +// +(p0) fsub.s1 tmp = P_hi, Res_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// tmp = P_hi - Res_hi +// +(p0) fma.s1 tmp = A_hi, sigma, tmp + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) 
fma.s1 sigma = A_lo, sigma, P_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// tmp = sigma * A_hi + tmp +// sigma = A_lo * sigma + P_lo +// +(p0) fma.s1 Res_lo = s_Y, sigma, tmp + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Res_lo = s_Y * sigma + tmp +// +(p0) fadd.s0 Result = Res_lo, Res_hi +br.ret.sptk b0 ;; +} +L(ATANL_NATVAL): +L(ATANL_UNSUPPORTED): +L(ATANL_NAN): +{ .mfb + nop.m 999 +(p0) fmpy.s0 Result = ArgX,ArgY +(p0) br.ret.sptk b0 ;; +} +L(ATANL_SPECIAL_HANDLING): +{ .mfi + nop.m 999 +(p0) fcmp.eq.s0 p0, p6 = f1, ArgY_orig + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fcmp.eq.s0 p0, p5 = f1, ArgX_orig + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6, p7 = ArgY, 0x007 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl special = 992 +} +;; + + +{ .mmi + nop.m 999 +(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + + +{ .mib +(p0) add table_ptr1 = table_ptr1, special + nop.i 999 +(p7) br.cond.spnt L(ATANL_ArgY_Not_ZERO) ;; +} +{ .mmf +(p0) ldfd Result = [table_ptr1], 8 + nop.m 999 +(p6) fclass.m.unc p14, p0 = ArgX, 0x035 ;; +} +{ .mmf + nop.m 999 +(p0) ldfd Result_lo = [table_ptr1], -8 +(p6) fclass.m.unc p15, p0 = ArgX, 0x036 ;; +} +{ .mfi + nop.m 999 +(p14) fmerge.s Result = ArgY, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p6) fclass.m.unc p13, p0 = ArgX, 0x007 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p14) fmerge.s Result_lo = ArgY, f0 + nop.i 999 ;; +} +{ .mfi +(p13) mov GR_Parameter_TAG = 36 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Return sign_Y * 0 when ArgX > +0 +// +(p15) fmerge.s Result = ArgY, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p15) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Return sign_Y * 0 when ArgX < -0 +// +(p0) fadd.s0 Result = Result, Result_lo +(p13) br.cond.spnt __libm_error_region ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Call error support funciton for atan(0,0) +// +(p0) 
br.ret.sptk b0 ;; +} +L(ATANL_ArgY_Not_ZERO): +{ .mfi + nop.m 999 +(p0) fclass.m.unc p9, p10 = ArgY, 0x023 + nop.i 999 ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.spnt L(ATANL_ArgY_Not_INF) ;; +} +{ .mfi + nop.m 999 +(p9) fclass.m.unc p6, p0 = ArgX, 0x017 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fclass.m.unc p7, p0 = ArgX, 0x021 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fclass.m.unc p8, p0 = ArgX, 0x022 + nop.i 999 ;; +} +{ .mmi +(p6) add table_ptr1 = 16, table_ptr1 ;; +(p0) ldfd Result = [table_ptr1], 8 + nop.i 999 ;; +} +{ .mfi +(p0) ldfd Result_lo = [table_ptr1], -8 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p6) fmerge.s Result = ArgY, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p6) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p6) fadd.s0 Result = Result, Result_lo +(p6) br.ret.sptk b0 ;; +} +// +// Load PI/2 and adjust its sign. +// Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal +// Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal +// +{ .mmi +(p7) add table_ptr1 = 32, table_ptr1 ;; +(p7) ldfd Result = [table_ptr1], 8 + nop.i 999 ;; +} +{ .mfi +(p7) ldfd Result_lo = [table_ptr1], -8 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fmerge.s Result = ArgY, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p7) fadd.s0 Result = Result, Result_lo +(p7) br.ret.sptk b0 ;; +} +// +// Load PI/4 and adjust its sign. 
+// Return +PI/4 when ArgY = +Inf and ArgX = +Inf +// Return -PI/4 when ArgY = -Inf and ArgX = +Inf +// +{ .mmi +(p8) add table_ptr1 = 48, table_ptr1 ;; +(p8) ldfd Result = [table_ptr1], 8 + nop.i 999 ;; +} +{ .mfi +(p8) ldfd Result_lo = [table_ptr1], -8 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p8) fmerge.s Result = ArgY, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p8) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p8) fadd.s0 Result = Result, Result_lo +(p8) br.ret.sptk b0 ;; +} +L(ATANL_ArgY_Not_INF): +{ .mfi + nop.m 999 +// +// Load PI/4 and adjust its sign. +// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf +// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf +// +(p0) fclass.m.unc p6, p0 = ArgX, 0x007 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7, p0 = ArgX, 0x021 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p8, p0 = ArgX, 0x022 + nop.i 999 ;; +} +{ .mmi +(p6) add table_ptr1 = 16, table_ptr1 ;; +(p6) ldfd Result = [table_ptr1], 8 + nop.i 999 ;; +} +{ .mfi +(p6) ldfd Result_lo = [table_ptr1], -8 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p6) fmerge.s Result = ArgY, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p6) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p6) fadd.s0 Result = Result, Result_lo +(p6) br.ret.spnt b0 ;; +} +{ .mfi + nop.m 999 +// +// return = sign_Y * PI/2 when ArgX = 0 +// +(p7) fmerge.s Result = ArgY, f0 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p7) fnorm.s0 Result = Result +(p7) br.ret.spnt b0 ;; +} +// +// return = sign_Y * 0 when ArgX = Inf +// +{ .mmi +(p8) ldfd Result = [table_ptr1], 8 ;; +(p8) ldfd Result_lo = [table_ptr1], -8 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p8) fmerge.s Result = ArgY, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p8) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p8) fadd.s0 Result = Result, Result_lo +(p8) br.ret.sptk b0 ;; +} +// +// return = sign_Y * PI when ArgX = -Inf +// 
+.endp atan2l +ASM_SIZE_DIRECTIVE(atan2l) +ASM_SIZE_DIRECTIVE(__atan2l) +ASM_SIZE_DIRECTIVE(__ieee754_atan2l) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 slot address: incoming sp - 32 (= new sp + 32) +// +// __libm_error_region: out-of-line error path of this file. +// Saves ar.pfs, gp and b0, spills FR_Y, FR_X and FR_RESULT into a +// 64-byte stack frame, calls __libm_error_support, then reloads the +// (possibly replaced) result from the frame into f8 and returns. +// GR_Parameter_X/Y/RESULT end up holding the addresses of the three +// spill slots (new sp + 16 / + 32 / + 48); presumably these are the +// pointer arguments of the call -- the register macros are defined +// outside this view, TODO confirm. +// + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new 64-byte stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack; pointer post-increments by 16 + add GR_Parameter_X = 16,sp // Parameter 1 address (new sp + 16) +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Copy the advanced Parameter 2 pointer (new sp + 48) + nop.b 0 // Parameter 3 address +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y // Point Parameter 2 back at the saved FR_Y slot + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp // Recompute the result slot address (sp + 48) after the call +};; +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_cbrt.S b/sysdeps/ia64/fpu/s_cbrt.S new file mode 100644 index 0000000000..cb17c46c64 --- /dev/null +++ b/sysdeps/ia64/fpu/s_cbrt.S @@ -0,0 +1,676 @@ +.file "cbrt.asm" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang +// of the Computational Software Lab, Intel Corporation. 
+// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 5/19/00: New version (modified algorithm) +// +// API +//============================================================== +// double cbrt(double) +// +// Overview of operation +//============================================================== +// Background +// +// Implementation +// +// cbrt(a) = cbrt(a y) / cbrt(y) +// = cbrt(1 - (1 - a y)) * 1/cbrt(y) +// +// where y = frcpa(a). +// +// * cbrt(1 - (1 - a y)) is approximated by a degree-5 polynomial +// +// 1 - (1/3)*r - (1/9)*r^2 - (5/81)*r^3 - (10/243)*r^4 - (22/729)*r^5 +// +// in r = 1 - a y. +// +// * The values 1/cbrt(y) are stored in a table of constants T0 +// to 64 bits of accuracy +// +// The table values are stored for three exponent values and are +// then multiplied by 2^(e/3) where e is the exponent of the input number. 
+// This computation is carried out in parallel with the polynomial +// evaluation: +// +// T = 2^(e/3) * T0 + + + + + +//=============== +// input = x +// C = frcpa(x) +// r = 1 - C * x +// +// Special values +//============================================================== + + + +// Registers used +//============================================================== +// f6-f15 +// r2, r3, r23-r26, r28-r30 (r3 holds the loaded poly_coeffs address) +// p6,p7,p8,p12 + +#include "libm_support.h" + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +// poly_coeffs: magnitudes of the coefficients of the degree-5 polynomial +// 1 - (1/3)*r - (1/9)*r^2 - (5/81)*r^3 - (10/243)*r^4 - (22/729)*r^5. +// The first entry is an 80-bit extended value (significand, then +// sign/exponent word); the remaining four are IEEE doubles. +poly_coeffs: +ASM_TYPE_DIRECTIVE(poly_coeffs,@object) +data8 0xaaaaaaaaaaaaaaab, 0x00003ffd // 1/3 +data8 0x3fbc71c71c71c71d, 0x3faf9add3c0ca459 // 1/9, 5/81 +data8 0x3fa511e8d2b3183b, 0x3f9ee7113506ac13 // 10/243, 22/729 +ASM_SIZE_DIRECTIVE(poly_coeffs) + +// T_table: 64-bit entries laid out as consecutive triples; within a +// triple the values are in ratio 1 : 2^(1/3) : 2^(2/3) (checked on the +// leading entries), i.e. one value per exponent residue. Presumably +// these are the significands of the T0 = 1/cbrt(y) constants described +// in the header -- TODO confirm against the lookup code. +T_table: +ASM_TYPE_DIRECTIVE(T_table,@object) + +data8 0x80155c748c374836, 0xa160019ed37fb4ae +data8 0xcb51ddcb9e93095e, 0x8040404b0879f7f9 +data8 0xa1960b5966da4608, 0xcb95f333968ad59b +data8 0x806b5dce4b405c10, 0xa1cc5dbe6dc2aab4 +data8 0xcbda64292d3ffd97, 0x8096b586974669b1 +data8 0xa202f97995b69c0d, 0xcc1f3184af961596 +data8 0x80bcd273d952a028, 0xa232fe6eb0c0577d +data8 0xcc5bb1ac954d33e2, 0x80e898c52813f2f3 +data8 0xa26a2582012f6e17, 0xcca12e9831fc6402 +data8 0x81149add67c2d208, 0xa2a197e5d10465cb +data8 0xcce70a67b64f24ad, 0x813b4e2c856b6e9a +data8 0xa2d25a532efefbc8, 0xcd24794726477ea5 +data8 0x8167c1dde03de7aa, 0xa30a5bd6e49e4ab8 +data8 0xcd6b096a0b70ee87, 0x818ed973b811135e +data8 0xa33b9c9b59879e24, 0xcda9177738b15a90 +data8 0x81bbc0c33e13ec98, 0xa3742fca6a3c1f21 +data8 0xcdf05f2247dffab9, 0x81e33e69fbe7504a +data8 0xa3a5f1273887bf22, 0xce2f0f347f96f906 +data8 0x820aec524e3c23e9, 0xa3d7ef508ff11574 +data8 0xce6e0be0cd551a61, 0x823880f78e70b805 +data8 0xa4115ce30548bc15, 0xceb666b2c347d1de +data8 0x826097a62a8e5200, 0xa443df0e53df577a +data8 0xcef609b0cb874f00, 0x8288dfe00e9b5eaf +data8 0xa4769fa5913c0ec3, 0xcf35fb5447e5c765 +data8 
0x82b15a10c5371624, 0xa4a99f303bc7def5 +data8 0xcf763c47ee869f00, 0x82da06a527b18937 +data8 0xa4dcde37779adf4b, 0xcfb6cd3888d71785 +data8 0x8302e60b635ab394, 0xa5105d46152c938a +data8 0xcff7aed4fbfbb447, 0x832bf8b2feec2f0e +data8 0xa5441ce89825cb8d, 0xd038e1ce5167e3c6 +data8 0x83553f0ce00e276b, 0xa5781dad3e54d899 +data8 0xd07a66d7bfa0ebba, 0x837eb98b50f8322a +data8 0xa5ac602406c4e68c, 0xd0bc3ea6b32d1b21 +data8 0x83a270f44c84f699, 0xa5d9601d95c2c0bc +data8 0xd0f4f0e8f36c1bf8, 0x83cc4d7cfcfac5ca +data8 0xa60e1e1a2de14745, 0xd1376458e34b037e +data8 0x83f65f78a8872b4c, 0xa6431f6e3fbd9658 +data8 0xd17a2ca133f78572, 0x8420a75f2f7b53c8 +data8 0xa67864b0d432fda4, 0xd1bd4a80301c5715 +data8 0x844510461ff14209, 0xa6a6444aa0243c0b +data8 0xd1f71682b2fa4575, 0x846fbd91b930bed2 +data8 0xa6dc094d10f25792, 0xd23ad555f773f059 +data8 0x84947e18234f3294, 0xa70a574cc02bba69 +data8 0xd2752c7039a5bf73, 0x84bf92755825045a +data8 0xa7409e2af9549084, 0xd2b98ee008c06b59 +data8 0x84e4ac0ee112ba51, 0xa76f5c64ca2cf13b +data8 0xd2f4735ffd700280, 0x8509ef44b86f20be +data8 0xa79e4f0babab5dc0, 0xd32f99ed6d9ac0e1 +data8 0x85359d5d91768427, 0xa7d5579ae5164b85 +data8 0xd374f0666c75d51c, 0x855b3bd5b7384357 +data8 0xa804bd3c6fe61cc8, 0xd3b0a7d13618e4a1 +data8 0x858104f0c415f79a, 0xa8345895e5250a5a +data8 0xd3eca2ea53bcec0c, 0x85a6f90390d29864 +data8 0xa8642a122b44ef0b, 0xd428e23874f13a17 +data8 0x85d3772fcd56a1dd, 0xa89c38ca18f6108b +data8 0xd46f82fe293bc6d3, 0x85f9c982fcc002f3 +data8 0xa8cc81063b6e87ca, 0xd4ac57e9b7186420 +data8 0x862047e0e7ea554b, 0xa8fd00bfa409285e +data8 0xd4e972becb04e8b8, 0x8646f2a26f7f5852 +data8 0xa92db8664d5516da, 0xd526d40a7a9b43a3 +data8 0x866dca21754096b5, 0xa95ea86b75cc2c20 +data8 0xd5647c5b73917370, 0x8694ceb8dfd17a37 +data8 0xa98fd141a4992deb, 0xd5a26c4201bd6d13 +data8 0x86bc00c49e9307e8, 0xa9c1335cae7446ba +data8 0xd5e0a45015350a7e, 0x86dccd74fce79610 +data8 0xa9ea8686f556f645, 0xd614b539c6194104 +data8 0x870453c845acf90f, 0xaa1c52d17906bb19 +data8 0xd6537310e224283f, 
0x872c089a1e90342c +data8 0xaa4e59b046dab887, 0xd6927ab62244c917 +data8 0x8753ec4a92d16c5e, 0xaa809b9c60d1890b +data8 0xd6d1ccc1fc4ef4b7, 0x877bff3aca19f6b4 +data8 0xaab319102f3f9b33, 0xd71169cea98fdded +data8 0x879d88b6fe1c324c, 0xaadd5a18c1e21274 +data8 0xd746a66a5bc9f6d9, 0x87c5f346dbf98c3a +data8 0xab1045f2ac31bdf5, 0xd786ce8f0fae5317 +data8 0x87e7c653efacef2c, 0xab3ae3ab2df7231e +data8 0xd7bc7ff214c4e75a, 0x881089d4e73ffefc +data8 0xab6e3f945d1e96fc, 0xd7fd35467a517ed1 +data8 0x88397e6a366f2a8a, 0xaba1d953a08fa94e +data8 0xd83e38838648d815, 0x885bc559e5e1c081 +data8 0xabcd090db7ef4c3f, 0xd874a1db598b8951 +data8 0x887e2ee392bb7a93, 0xabf864602d7c323d +data8 0xd8ab42205b80edaf, 0x88a7a8587e404257 +data8 0xac2ca5886ccf9b57, 0xd8ed1849d202f965 +data8 0x88ca5eda67594784, 0xac5861d4aa441f0f +data8 0xd92432bd5a173685, 0x88f4356166bd590e +data8 0xac8d183fe3a2fbed, 0xd9669ca45b03c23e +data8 0x89173a0acf5ce026, 0xacb93703ff51571e +data8 0xd99e3327cf89574e, 0x893a62a098b6a57b +data8 0xace5830ad0c3f14b, 0xd9d602b19b100466 +data8 0x895daf637236ae2c, 0xad11fca5d78b3ff2 +data8 0xda0e0ba86c096841, 0x89883b9d1c2fa9c5 +data8 0xad4797fddf91a798, 0xda5195fcdb1c3dce +data8 0x89abd8dd374a5d7b, 0xad747701e559ebcb +data8 0xda8a1eb87a491f6c, 0x89cf9b1dcd197fa0 +data8 0xada184a47e9c7613, 0xdac2e230b91c3f84 +data8 0x89f382a258ea79de, 0xadcec13ab0dda8ff +data8 0xdafbe0d0b66aea30, 0x8a178faf06648f29 +data8 0xadfc2d1a5fd21ba8, 0xdb351b04a8fafced +data8 0x8a3bc288b3e1d18a, 0xae29c89a5053c33a +data8 0xdb6e9139e33cdd8e, 0x8a601b74f4d1f835 +data8 0xae5794122b638df9, 0xdba843ded7151ea1 +data8 0x8a849aba14274764, 0xae858fda8137ae0a +data8 0xdbe2336319b61fc8, 0x8aa9409f16cdbc9b +data8 0xaeb3bc4ccc56d3d1, 0xdc1c60376789fa68 +data8 0x8ace0d6bbe2cb316, 0xaee219c374c09920 +data8 0xdc56cacda82d0cd5, 0x8af301688ab33558 +data8 0xaf10a899d3235fe7, 0xdc917398f2797814 +data8 0x8b181cdebe6f3206, 0xaf3f692c341fe8b4 +data8 0xdccc5b0d90a3e628, 0x8b3d60185fafcb7c +data8 0xaf6e5bd7db9ae6c2, 0xdd0781a10469f0f2 
+data8 0x8b62cb603bb2fad0, 0xaf9d80fb081cd91b +data8 0xdd42e7ca0b52838f, 0x8b80d7d6bc4104de +data8 0xafc35ce063eb3787, 0xdd729ad01c69114d +data8 0x8ba68bf73ac74f39, 0xaff2ddcb5f28f03d +data8 0xddae749c001fbf5e, 0x8bcc68fb9f9f7335 +data8 0xb022923b148e05c5, 0xddea8f50a51c69b1 +data8 0x8bf26f31c534fca2, 0xb0527a919adbf58b +data8 0xde26eb69a0f0f111, 0x8c10f86e13a1a1f9 +data8 0xb078f3ab1d701c65, 0xde576480262399bc +data8 0x8c3749916cc6abb5, 0xb0a93a6870649f31 +data8 0xde943789645933c8, 0x8c5dc4c4f7706032 +data8 0xb0d9b624d62ec856, 0xded14d58139a28af +data8 0x8c7cac3a8c42e3e0, 0xb100a5f53fb3c8e1 +data8 0xdf025c00bbf2b5c7, 0x8ca373f1b7bf2716 +data8 0xb131821882f5540a, 0xdf3feb44d723a713 +data8 0x8cc29907fb951294, 0xb158bf8e4cb04055 +data8 0xdf715bc16c159be0, 0x8ce9ae4e9492aac8 +data8 0xb189fd69d56b238f, 0xdfaf66240e29cda8 +data8 0x8d0911dddbfdad0e, 0xb1b189958e8108e4 +data8 0xdfe139cbf6e19bdc, 0x8d3075c4f20f04ee +data8 0xb1e32a8165b09832, 0xe01fc0fe94d9fc52 +data8 0x8d5018a9d4de77d5, 0xb20b0678fc271eec +data8 0xe051f92ffcc0bd60, 0x8d77cc47dd143515 +data8 0xb23d0bd3f7592b6e, 0xe090feec9c9a06ac +data8 0x8d97af6352739cb7, 0xb26538b2db8420dc +data8 0xe0c39d0c9ff862d6, 0x8db7af523167800f +data8 0xb28d89e339ceca14, 0xe0f668eeb99f188d +data8 0x8ddfd80bc68c32ff, 0xb2c022ca12e55a16 +data8 0xe1362890eb663139, 0x8e00197e1e7c88fe +data8 0xb2e8c6852c6b03f1, 0xe1695c7212aecbaa +data8 0x8e207859f77e20e7, 0xb3118f4eda9fe40f +data8 0xe19cbf0391bbbbe9, 0x8e40f4ce60c9f8e2 +data8 0xb33a7d6268109ebe, 0xe1d050901c531e85 +data8 0x8e69ba46cf2fde4d, 0xb36ddbc5ea70ec55 +data8 0xe2110903b4f4047a, 0x8e8a7a00bd7ae63e +data8 0xb3971e9b39264023, 0xe2450559b4d80b6d +data8 0x8eab57ef1cf2f529, 0xb3c0877ecc18e24a +data8 0xe27931a231554ef3, 0x8ecc5442cffb1dad +data8 0xb3ea16ae3a6c905f, 0xe2ad8e2ac3c5b04b +data8 0x8eed6f2d2a4acbfe, 0xb413cc67aa0e4d2d +data8 0xe2e21b41b9694cce, 0x8f0ea8dff24441ff +data8 0xb43da8e9d163e1af, 0xe316d93615862714 +data8 0x8f385c95d696b817, 0xb47233773b84d425 +data8 
0xe3590bd86a0d30f9, 0x8f59dc43edd930f3 +data8 0xb49c6825430fe730, 0xe38e38e38e38e38e +data8 0x8f7b7b5f5ffad1c4, 0xb4c6c46bcdb27dcf +data8 0xe3c397d1e6db7839, 0x8f9d3a1bea165f38 +data8 0xb4f1488c0b35d26f, 0xe3f928f5953feb9e +data8 0x8fbf18adc34b66da, 0xb51bf4c7c51f0168 +data8 0xe42eeca17c62886c, 0x8fe117499e356095 +data8 0xb546c9616087ab9c, 0xe464e32943446305 +data8 0x90033624aa685f8d, 0xb571c69bdffd9a70 +data8 0xe49b0ce15747a8a2, 0x9025757495f36b86 +data8 0xb59cecbae56984c3, 0xe4d16a1eee94e9d4 +data8 0x903f3a5dcc091203, 0xb5bd64512bb14bb7 +data8 0xe4fa52107353f67d, 0x9061b2fceb2bdbab +data8 0xb5e8d2a4bf5ba416, 0xe5310a471f4d2dc3 +data8 0x90844ca7211032a7, 0xb6146a9a1bc47819 +data8 0xe567f6f1c2b9c224, 0x90a7079403e6a15d +data8 0xb6402c7749d621c0, 0xe59f18689a9e4c9a +data8 0x90c9e3fbafd63799, 0xb66c1882fb435ea2 +data8 0xe5d66f04b8a68ecf, 0x90ece216c8a16ee4 +data8 0xb6982f048c999a56, 0xe60dfb2005c192e9 +data8 0x9110021e7b516f0a, 0xb6c47044075b4142 +data8 0xe645bd1544c7ea51, 0x912a708a39be9075 +data8 0xb6e5bd6bfd02bafd, 0xe66fb21b505b20a0 +data8 0x914dcc7b31146370, 0xb7124a2736ff8ef2 +data8 0xe6a7d32af4a7c59a, 0x91714af8cfe984d5 +data8 0xb73f026a01e94177, 0xe6e02b129c6a5ae4 +data8 0x918c00a6f3795e97, 0xb760a959f1d0a7a7 +data8 0xe70a9136a7403039, 0x91afbc299ed0295d +data8 0xb78dae7e06868ab0, 0xe74349fb2d92a589 +data8 0x91d39add3e958db0, 0xb7badff8ad9e4e02 +data8 0xe77c3a9c86ed7d42, 0x91ee9920a8974d92 +data8 0xb7dce25b8e17ae9f, 0xe7a713f88151518a +data8 0x9212b5fcac537c19, 0xb80a6226904045e2 +data8 0xe7e067453317ed2b, 0x9236f6b256923fcf +data8 0xb8380f1cafd73c1c, 0xe819f37a81871bb5 +data8 0x92523ee6f90dcfc3, 0xb85a6ea8e321b4d8 +data8 0xe8454236bfaeca14, 0x9276bef031e6eb79 +data8 0xb8886b684ae7d2fa, 0xe87f32f24c3fc90e +data8 0x929236ec237a24ad, 0xb8ab0726fa00cf5d +data8 0xe8aacd8688892ba6, 0x92b6f70b7efe9dc3 +data8 0xb8d954a4d13b7cb1, 0xe8e523fd32f606f7 +data8 0x92d29f61eec7dc2b, 0xb8fc2d4f6cd9f04a +data8 0xe9110b5311407927, 0x92f7a05d5b8ba92f +data8 0xb92acc851476b1ab, 
0xe94bc8bf0c108fa3 +data8 0x931379a403be5c16, 0xb94de2d841a184c2 +data8 0xe977fdc439c2ca3c, 0x9338bc44de2e3f34 +data8 0xb97cd4c36c92693c, 0xe9b3236528fc349e +data8 0x9354c71412c69486, 0xb9a0297f172665e3 +data8 0xe9dfa70b745ac1b4, 0x937a4c273907e262 +data8 0xb9cf6f21e36c3924, 0xea1b36268d0eaa38 +data8 0x93968919f6e7975d, 0xb9f3030951267208 +data8 0xea480963fd394197, 0x93bc516fdd4680c9 +data8 0xba229d6a618e7c59, 0xea84034425f27484 +data8 0x93d8c123d9be59b2, 0xba467144459f9855 +data8 0xeab12713138dd1cc, 0x93f546c955e60076 +data8 0xba6a60c3c48f1a4b, 0xeade6db73a5e503b +data8 0x941b70a65879079f, 0xba9a76056b67ee7a +data8 0xeb1b0268343b121b, 0x943829f337410591 +data8 0xbabea699563ada6e, 0xeb489b0b2bdb5f14 +data8 0x9454f995765bc4d2, 0xbae2f350b262cc4b +data8 0xeb765721e85f03d0, 0x947b86b57f5842ed +data8 0xbb1385a23be24e57, 0xebb389645f222f62 +data8 0x94988aeb23470f86, 0xbb3814975e17c680 +data8 0xebe198f090607e0c, 0x94b5a5dc9695f42a +data8 0xbb5cc031009bf467, 0xec0fcc9321024509 +data8 0x94d2d7a9170d8b42, 0xbb81889680024764 +data8 0xec3e247da8b82f61, 0x94f9e87dd78bf019 +data8 0xbbb2c0d8703ae95d, 0xec7c27d21321c9f7 +data8 0x95175019a503d89e, 0xbbd7cd09ba3c5463 +data8 0xecaad5278824e453, 0x9534cefa625fcb3a +data8 0xbbfcf68c4977718f, 0xecd9a76d097d4e77 +data8 0x955265405c491a25, 0xbc223d88cfc88eee +data8 0xed089ed5dcd99446, 0x9570130c1f9bb857 +data8 0xbc47a2284fee4ff8, 0xed37bb95add09a1c +data8 0x9597ca4119525184, 0xbc79ac0916ed7b8a +data8 0xed76c70508f904b6, 0x95b5af6fb5aa4d3c +data8 0xbc9f5670d1a13030, 0xeda63bb05e7f93c6 +data8 0x95d3ac9273aafd7a, 0xbcc51f068cb95c1d +data8 0xedd5d661daed2dc4, 0x95f1c1cafdfd3684 +data8 0xbceb05f4b30a9bc0, 0xee05974eef86b903 +data8 0x960fef3b430b8d5f, 0xbd110b6604c7d306 +data8 0xee357ead791fc670, 0x962e350575b409c5 +data8 0xbd372f8598620f19, 0xee658cb3c134a463 +data8 0x964c934c0dfc1708, 0xbd5d727edb6b3c7e +data8 0xee95c1987f080211, 0x966b0a31c9c6bc7d +data8 0xbd83d47d937bbc6d, 0xeec61d92d8c4314f +data8 0x968999d9ad8d264e, 0xbdaa55addf1ae47d 
+data8 0xeef6a0da64a014ac, 0x96a8426705198795 +data8 0xbdd0f63c36aa73f0, 0xef274ba72a07c811 +data8 0x96c703fd64445ee5, 0xbdf7b6556d550a15 +data8 0xef581e31a2c91260, 0x96e5dec0a7b4268d +data8 0xbe1e9626b1ffa96b, 0xef8918b2bc43aec6 +data8 0x9704d2d4f59f79f3, 0xbe4595dd903e5371 +data8 0xefba3b63d89d7cbf, 0x9723e05ebe91b9b0 +data8 0xbe6cb5a7f14bc935, 0xefeb867ecffaa607 +data8 0x97430782be323831, 0xbe93f5b41d047cf7 +data8 0xf01cfa3df1b9c9fa, 0x97624865fc0df8bf +data8 0xbebb5630bae4c15f, 0xf04e96dc05b43e2d +data8 0x9781a32dcc640b2a, 0xbee2d74cd30a430c +data8 0xf0805c944d827454, 0x97a117ffd0f48e46 +data8 0xbf0a7937cf38d981, 0xf0b24ba285c495cb +data8 0x97c0a701f9d263c9, 0xbf323c217be2bc8c +data8 0xf0e46442e76f6569, 0x97e0505a8637a036 +data8 0xbf5a203a09342bbb, 0xf116a6b2291d7896 +data8 0x97f57a9fb0b08c6e, 0xbf74cad1c14ebfc4 +data8 0xf1383fa9e9b5b381, 0x9815503365914a9d +data8 0xbf9ce6a497a89f78, 0xf16ac84f90083b9b +data8 0x98354085054fd204, 0xbfc52428bec6e72f +data8 0xf19d7b686dcb03d7, 0x98554bbbf8a77902 +data8 0xbfed838fddab024b, 0xf1d0593311db1757 +data8 0x987571fffb7f94f6, 0xc016050c0420981a +data8 0xf20361ee8f1c711e, 0x9895b3791dd03c23 +data8 0xc03ea8cfabddc330, 0xf23695da7de51d3f +data8 0x98ab43a5fc65d0c8, 0xc059d3cbd65ddbce +data8 0xf258d095e465cc35, 0x98cbb2d196bd713d +data8 0xc082b122a3c78c9d, 0xf28c4d0bfc982b34 +data8 0x98ec3d9ec7b6f21a, 0xc0abb1499ae736c4 +data8 0xf2bff55eb3f0ea71, 0x990ce436db5e8344 +data8 0xc0d4d474c3aedaaf, 0xf2f3c9cf9884636e +data8 0x9922b8218160967a, 0xc0f054ca33eb3437 +data8 0xf31670135ab9cc0f, 0x99438d686f75779d +data8 0xc119b2c67e600ed0, 0xf34a8e9f0b54cdfb +data8 0x99647eea131fa20b, 0xc1433453de2033ff +data8 0xf37ed9fa6b8add3f, 0x997a85045a47c6d0 +data8 0xc15ef3e44e10032d, 0xf3a1cfe884ef6bb6 +data8 0x999ba5f14f8add02, 0xc188b130431d80e6 +data8 0xf3d66689dcc8e8d3, 0x99bce38b5465ecae +data8 0xc1b2929d6067730e, 0xf40b2ab069d5c96a +data8 0x99d31ca0887f30f9, 0xc1ce9268f31cc734 +data8 0xf42e718b90c8bc16, 0x99f48a669c74c09e +data8 
0xc1f8b0877c1b0c08, 0xf463822a0a3b4b00 +data8 0x9a16154eb445c873, 0xc222f35a87b415ba +data8 0xf498c1076015faf8, 0x9a2c822ec198d667 +data8 0xc23f3467349e5c88, 0xf4bc5a19a33990b5 +data8 0x9a4e3e080cd91b78, 0xc269b4e40e088c01 +data8 0xf4f1e6a7d6f5425f, 0x9a70177afe52322e +data8 0xc2945aac24daaf6e, 0xf527a232cf6be334 +data8 0x9a86b8fa94eebe10, 0xc2b0de05e43c1d66 +data8 0xf54b8ecdcda90851, 0x9aa8c42866ae2958 +data8 0xc2dbc275e1229d09, 0xf5819949c7ad87b4 +data8 0x9abf86f9e12fc45e, 0xc2f86fca9d80eeff +data8 0xf5a5bac9213b48a9, 0x9ae1c462fc05f49d +data8 0xc323938449a2587e, 0xf5dc1501f324a812 +data8 0x9af8a8dc936b84d0, 0xc3406b40a538ed20 +data8 0xf6006bee86b5589e, 0x9b1b19033be35730 +data8 0xc36bcee8211d15e0, 0xf63716b2fa067fa4 +data8 0x9b3da7daf04c2892, 0xc397593adf2ba366 +data8 0xf66df22fb6132b9c, 0x9b54c2e4c8a9012b +data8 0xc3b475b6206155d5, 0xf6929fb98225deb1 +data8 0x9b77854e6c661200, 0xc3e0410243b97383 +data8 0xf6c9cd13021e3fea, 0x9b8ec2e678d56d2f +data8 0xc3fd890709833d37, 0xf6eeb177472cedae +data8 0x9ba60e6a5ca133b6, 0xc41ae295f7e7fa06 +data8 0xf713abf4cb0b3afb, 0x9bc919ea66a151a4 +data8 0xc44709f7bb8a4dd2, 0xf74b4d5333684ef1 +data8 0x9be0887c09ef82bb, 0xc4648fb0e0bec4c1 +data8 0xf7707f75a72f8e94, 0x9c03c8d5fffc3503 +data8 0xc490f9a94695ba14, 0xf7a874b97927af44 +data8 0x9c1b5ad21a81cbb9, 0xc4aeac0173b7d390 +data8 0xf7cddf140aedf1d8, 0x9c3ed09216e9ca02 +data8 0xc4db5941007aa853, 0xf806291bacb7f7a9 +data8 0x9c568656c0423def, 0xc4f938aec206291a +data8 0xf82bcc43b92eafef, 0x9c7a320af242ce60 +data8 0xc52629e899dfd622, 0xf8646bf0defb759e +data8 0x9c920bf7a8c01dc2, 0xc54436e44043b965 +data8 0xf88a487dfc3ff5f7, 0x9ca9f475d98b159c +data8 0xc562563abf9ea07f, 0xf8b03c2b46cdc17f +data8 0x9ccdeca60e80b5f8, 0xc58fa7d1dc42921c +data8 0xf8e95541c152ae7a, 0x9ce5f9d4653d4902 +data8 0xc5adf561b91e110a, 0xf90f832c2700c160 +data8 0x9cfe15cb38bfdd8e, 0xc5cc5591bdbd82fa +data8 0xf935c88e0c7f419b, 0x9d225b983f6c1f96 +data8 0xc5fa08f1ff20593c, 0xf96f5cd84fd86873 +data8 0x9d3a9cca32261ed7, 
0xc618980a79ce6862 +data8 0xf995dd53ebdd9d6d, 0x9d52ecfccebe1768 +data8 0xc6373a09e34b50fa, 0xf9bc75a034436a41 +data8 0x9d77818d95b82f86, 0xc66550a6e0baaf35 +data8 0xf9f686f26d5518de, 0x9d8ff7893fa4706c +data8 0xc6842241926342c9, 0xfa1d5b39b910a8c5 +data8 0x9da87cbef36f2a5e, 0xc6a3070b7c93bb9e +data8 0xfa4447acc4ecbfd2, 0x9dcd6140b4a35aeb +data8 0xc6d18260bb84081b, 0xfa7ed7e51e6fdfb4 +data8 0x9de60cd06dc6e2d4, 0xc6f0977c9416828b +data8 0xfaa601394d49a1a0, 0x9dfec7d4cc43b76f +data8 0xc70fc0117c641630, 0xfacd431644ce0e40 +data8 0x9e17925ec9fccc4a, 0xc72efc34d7e615be +data8 0xfaf49d96f7a75909, 0x9e3cdf6db57dc075 +data8 0xc75dfb441594141e, 0xfb2fd3c65e562fd5 +data8 0x9e55d110b63637a8, 0xc77d68aa019bda4c +data8 0xfb576c5762024805, 0x9e6ed27594550d2e +data8 0xc79ce9ea478dbc4f, 0xfb7f1debc22c4040 +data8 0x9e87e3adc385d393, 0xc7bc7f1ae453219d +data8 0xfba6e89f32d0190a, 0x9ead9b54b37a1055 +data8 0xc7ec0476e15e141a, 0xfbe2c803a0894893 +data8 0x9ec6d46a3d7de215, 0xc80bcbe16f1d540f +data8 0xfc0ad1ff0ed9ecf0, 0x9ee01d9108be3154 +data8 0xc82ba78a5d349735, 0xfc32f57bdfbcbe7f +data8 0x9ef976db07288d04, 0xc84b978847a06b87 +data8 0xfc5b32968f99b21c, 0x9f12e05a4759ec25 +data8 0xc86b9bf1ee817bc6, 0xfc83896bc861ab08 +data8 0x9f2c5a20f4da6668, 0xc88bb4de3667cdf4 +data8 0xfcabfa1861ed4815, 0x9f52af78ed1733ca +data8 0xc8bc00e7fe9e23a3, 0xfce8d3cea7d3163e +data8 0x9f6c52426a39d003, 0xc8dc4d7ff2d25232 +data8 0xfd118595143ee273, 0x9f860593d42fd7f3 +data8 0xc8fcaeebcb40eb47, 0xfd3a519943d4865a +data8 0x9f9fc97fdb96bd51, 0xc91d25431426a663 +data8 0xfd6337f8e1ae5a4b, 0x9fb99e194f4a7037 +data8 0xc93db09d7fdb2949, 0xfd8c38d1c8e927eb +data8 0x9fd383731ca51db9, 0xc95e5112e721582a +data8 0xfdb5544205095a53, 0x9fed79a04fbf9423 +data8 0xc97f06bb49787677, 0xfdde8a67d2613531 +data8 0xa00780b413b24ee8, 0xc99fd1aecd6e1b06 +data8 0xfe07db619e781611, 0xa02eab2c4474b0cd +data8 0xc9d12a3e27bb1625, 0xfe460768d80bf758 +data8 0xa048dcd51ccfd142, 0xc9f22ad82ba3d5f0 +data8 0xfe6f9bfb06cd32f6, 0xa0631fa894b11b8d 
+data8 0xca134113105e67b2, 0xfe994bcd3d14fcc2 +data8 0xa07d73ba65e680af, 0xca346d07b045a876 +data8 0xfec316fecaf3f2ab, 0xa097d91e6aaf71b0 +data8 0xca55aecf0e94bb88, 0xfeecfdaf33fadb80 +data8 0xa0b24fe89e02602f, 0xca77068257be9bab +data8 0xff16fffe2fa8fad6, 0xa0ccd82d1bd2f68b +data8 0xca98743ae1c693a8, 0xff411e0ba9db886d +data8 0xa0e77200215909e6, 0xcab9f8122c99a101 +data8 0xff6b57f7c33e4e9a, 0xa1021d760d584855 +data8 0xcadb9221e268c3b5, 0xff95ade2d1bd7358 +data8 0xa11cdaa36068a57d, 0xcafd4283d8043dfd +data8 0xffc01fed60f86fb5, 0xa137a99cbd3f880b +data8 0xcb1f09520d37c6fb, 0xffeaae3832b63956 +ASM_SIZE_DIRECTIVE(T_table) + + + + + + +.align 32 +.global cbrt# + +.section .text +.proc cbrt# +.align 32 +cbrt: + + +{ .mfi + // cbrt entry: the argument a is read from f8 and the result is written back to f8 + // get significand + getf.sig r23=f8 + // will continue only for normal/denormal numbers + (p0) fclass.nm.unc p12,p0 = f8, 0x1b + // r2 = pointer to C_1,...,C_5 followed by T_table + addl r2 = @ltoff(poly_coeffs), gp +} +{.mfi + // get exponent + getf.exp r24=f8 + // normalize a + fma.s1 f14=f8,f1,f0 + // r29=bias-((2^{12}-1)/3) -63=0xffff-0x555-0x3f=0xfa6b + mov r29=0xfa6b;; +} +{.mlx + mov r25=0x20000 + // r28=2^63 (significand with only its leading bit set) + movl r28=0x8000000000000000;; +} +{.mfb + // load start address for C_1,...,C_5 followed by T_table + ld8 r3=[r2] + // p12 (input failed the class test above): return a*1+0 + (p12) fma.d.s0 f8=f8,f1,f0 + (p12) br.ret.spnt b0 +} +{.mfi + nop.m 0 + // y=frcpa(a) + frcpa.s0 f8,p6=f1,f8 + // p7=1 if denormal input (significand r23 below 2^63, i.e. leading bit clear) + cmp.gtu p7,p0=r28,r23;; +} +{.mmi + // get exponent + (p7) getf.exp r24=f14 + // get normalized significand + (p7) getf.sig r23=f14 + // r28=bias-(2^{12}-1) + mov r28=0xf000;; +} +{.mii + // get r26=sign + and r26=r24,r25 + // eliminate leading 1 from r23=1st table index + shl r23=r23,1 + // eliminate sign from exponent (r25) + andcm r25=r24,r25;; +} +{.mib + add r2=32,r3 + // r23=1st table index (y_index,8 bits) + shr.u r23=r23,56 + nop.b 0 +} +{.mib + // load C_1 + ldfe f7=[r3],16 + // subtract bias from r25=exponent + sub r25=r25,r28 + nop.b 0;; +} +{.mib + // load C_2, C_3 + ldfpd 
f9,f10=[r3] + // 1: exponent*=5; // (2^{16}-1)/3=0x5555 + shladd r24=r25,2,r25 + nop.b 0 +} +{.mib + // load C_4, C_5 + ldfpd f11,f12=[r2],16 + // r23=3*y_index + shladd r23=r23,1,r23 + nop.b 0;; +} + +{.mfi + // r30=(5*expon)*16+5*expon=(0x55)*expon + shladd r30=r24,4,r24 + // r=1-a*y + (p6) fnma.s1 f6=f8,f14,f1 + // adjust T_table pointer by 1st index + shladd r2=r23,3,r2;; +} + +{.mii + nop.m 0 + // r24=(0x5500)*expon + shl r24=r30,8;; + // r24=(0x5555)*expon + add r24=r24,r30;; +} +{.mii + // r24=(0x5556)*expon // 0x5556=(2^{16}+2)/3 + add r24=r24,r25 + nop.i 0;; + // r24=floor(expon/3) + shr r24=r24,16;; +} +{.mfi + // r28=3*floor(expon/3) + shladd r28=r24,1,r24 + // f13=r^2=r*r + (p6) fma.s1 f13=f6,f6,f0 + // bias exponent + add r24=r29,r24;; +} +{.mfi + // get remainder of exponent/3 : r25-r28 + sub r25=r25,r28 + // c2+c3*r + (p6) fma.s1 f9=f10,f6,f9 + // add sign to exponent + or r24=r24,r26 +} +{.mfi + nop.m 0 + // c4+c5*r + (p6) fma.s1 f11=f12,f6,f11 + nop.i 0;; +} +{.mmi + // f14=sign*2^{floor(expon/3)}, built from the re-biased exponent in r24 + (p6) setf.exp f14=r24 + // adjust T_table pointer by 2nd index (r25=remainder of expon/3) + shladd r2=r25,3,r2 + nop.i 0;; +} +{.mmi + // load T + (p6) ldf8 f8=[r2] + nop.m 0 + nop.i 0;; +} + +{.mfi + nop.m 0 + // (c2+c3*r)+r^2*(c4+c5*r) + (p6) fma.s1 f9=f11,f13,f9 + nop.i 0 +} +{.mfi + nop.m 0 + // c1*r + (p6) fma.s1 f7=f7,f6,f0 + nop.i 0;; +} + +{.mfi + nop.m 0 + // P=c1*r+r^2*[(c2+c3*r)+r^2*(c4+c5*r)] + (p6) fma.s1 f9=f9,f13,f7 + nop.i 0 +} +{.mfi + nop.m 0 + // T'=T*f14 (table value scaled by the sign and exponent factor) + (p6) fma.s1 f8=f8,f14,f0 + nop.i 0;; +} +{.mfb + nop.m 0 + // result = T'-T'*P + (p6) fnma.d.s0 f8=f8,f9,f8 + br.ret.sptk b0;; +} +.endp cbrt +ASM_SIZE_DIRECTIVE(cbrt) diff --git a/sysdeps/ia64/fpu/s_cbrtf.S b/sysdeps/ia64/fpu/s_cbrtf.S new file mode 100644 index 0000000000..620bbb50de --- /dev/null +++ b/sysdeps/ia64/fpu/s_cbrtf.S @@ -0,0 +1,655 @@ +.file "cbrtf.asm" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. 
+// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang +// of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 5/18/00: New version (modified algorithm) +// +// API +//============================================================== +// float cbrtf(float) +// +// Overview of operation +//============================================================== +// Background +// +// Implementation +// +// cbrt(a) = cbrt(a y) / cbrt(y) +// = cbrt(1 - (1 - a y)) * 1/cbrt(y) +// +// where y = frcpa(a). +// +// * cbrt(1 - (1 - a y)) is approximated by a degree-2 polynomial +// +// 1 - (1/3)*r - (1/9)*r^2 +// +// in r = 1 - a y. 
+// +// * The values 1/cbrt(y) are stored in a table of constants T0 +// to 64 bits of accuracy +// +// The table values are stored for three exponent values and are +// then multiplied by 2^(e/3), where e is the exponent of the input number. +// This computation is carried out in parallel with the polynomial +// evaluation: +// +// T = 2^(e/3) * T0 + + + + + +//=============== +// input = x +// C = frcpa(x) +// r = 1 - C * x +// +// Special values +//============================================================== + + + +// Registers used +//============================================================== +// f6-f15 +// r2, r23-r26, r28-r30 +// p6,p7,p8,p12 + +#include "libm_support.h" + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +poly_coeffs: +ASM_TYPE_DIRECTIVE(poly_coeffs,@object) +data8 0xaaaaaaaaaaaaaaab, 0x00003ffd // 1/3 +data8 0xe38e38e38e38e38e, 0x00003ffb // 1/9 +ASM_SIZE_DIRECTIVE(poly_coeffs) + + +T_table: +ASM_TYPE_DIRECTIVE(T_table,@object) + +data8 0x80155c748c374836, 0xa160019ed37fb4ae +data8 0xcb51ddcb9e93095e, 0x8040404b0879f7f9 +data8 0xa1960b5966da4608, 0xcb95f333968ad59b +data8 0x806b5dce4b405c10, 0xa1cc5dbe6dc2aab4 +data8 0xcbda64292d3ffd97, 0x8096b586974669b1 +data8 0xa202f97995b69c0d, 0xcc1f3184af961596 +data8 0x80bcd273d952a028, 0xa232fe6eb0c0577d +data8 0xcc5bb1ac954d33e2, 0x80e898c52813f2f3 +data8 0xa26a2582012f6e17, 0xcca12e9831fc6402 +data8 0x81149add67c2d208, 0xa2a197e5d10465cb +data8 0xcce70a67b64f24ad, 0x813b4e2c856b6e9a +data8 0xa2d25a532efefbc8, 0xcd24794726477ea5 +data8 0x8167c1dde03de7aa, 0xa30a5bd6e49e4ab8 +data8 0xcd6b096a0b70ee87, 0x818ed973b811135e +data8 0xa33b9c9b59879e24, 0xcda9177738b15a90 +data8 0x81bbc0c33e13ec98, 0xa3742fca6a3c1f21 +data8 0xcdf05f2247dffab9, 0x81e33e69fbe7504a +data8 0xa3a5f1273887bf22, 0xce2f0f347f96f906 +data8 0x820aec524e3c23e9, 0xa3d7ef508ff11574 +data8 0xce6e0be0cd551a61, 0x823880f78e70b805 +data8 
0xa4115ce30548bc15, 0xceb666b2c347d1de +data8 0x826097a62a8e5200, 0xa443df0e53df577a +data8 0xcef609b0cb874f00, 0x8288dfe00e9b5eaf +data8 0xa4769fa5913c0ec3, 0xcf35fb5447e5c765 +data8 0x82b15a10c5371624, 0xa4a99f303bc7def5 +data8 0xcf763c47ee869f00, 0x82da06a527b18937 +data8 0xa4dcde37779adf4b, 0xcfb6cd3888d71785 +data8 0x8302e60b635ab394, 0xa5105d46152c938a +data8 0xcff7aed4fbfbb447, 0x832bf8b2feec2f0e +data8 0xa5441ce89825cb8d, 0xd038e1ce5167e3c6 +data8 0x83553f0ce00e276b, 0xa5781dad3e54d899 +data8 0xd07a66d7bfa0ebba, 0x837eb98b50f8322a +data8 0xa5ac602406c4e68c, 0xd0bc3ea6b32d1b21 +data8 0x83a270f44c84f699, 0xa5d9601d95c2c0bc +data8 0xd0f4f0e8f36c1bf8, 0x83cc4d7cfcfac5ca +data8 0xa60e1e1a2de14745, 0xd1376458e34b037e +data8 0x83f65f78a8872b4c, 0xa6431f6e3fbd9658 +data8 0xd17a2ca133f78572, 0x8420a75f2f7b53c8 +data8 0xa67864b0d432fda4, 0xd1bd4a80301c5715 +data8 0x844510461ff14209, 0xa6a6444aa0243c0b +data8 0xd1f71682b2fa4575, 0x846fbd91b930bed2 +data8 0xa6dc094d10f25792, 0xd23ad555f773f059 +data8 0x84947e18234f3294, 0xa70a574cc02bba69 +data8 0xd2752c7039a5bf73, 0x84bf92755825045a +data8 0xa7409e2af9549084, 0xd2b98ee008c06b59 +data8 0x84e4ac0ee112ba51, 0xa76f5c64ca2cf13b +data8 0xd2f4735ffd700280, 0x8509ef44b86f20be +data8 0xa79e4f0babab5dc0, 0xd32f99ed6d9ac0e1 +data8 0x85359d5d91768427, 0xa7d5579ae5164b85 +data8 0xd374f0666c75d51c, 0x855b3bd5b7384357 +data8 0xa804bd3c6fe61cc8, 0xd3b0a7d13618e4a1 +data8 0x858104f0c415f79a, 0xa8345895e5250a5a +data8 0xd3eca2ea53bcec0c, 0x85a6f90390d29864 +data8 0xa8642a122b44ef0b, 0xd428e23874f13a17 +data8 0x85d3772fcd56a1dd, 0xa89c38ca18f6108b +data8 0xd46f82fe293bc6d3, 0x85f9c982fcc002f3 +data8 0xa8cc81063b6e87ca, 0xd4ac57e9b7186420 +data8 0x862047e0e7ea554b, 0xa8fd00bfa409285e +data8 0xd4e972becb04e8b8, 0x8646f2a26f7f5852 +data8 0xa92db8664d5516da, 0xd526d40a7a9b43a3 +data8 0x866dca21754096b5, 0xa95ea86b75cc2c20 +data8 0xd5647c5b73917370, 0x8694ceb8dfd17a37 +data8 0xa98fd141a4992deb, 0xd5a26c4201bd6d13 +data8 0x86bc00c49e9307e8, 
0xa9c1335cae7446ba +data8 0xd5e0a45015350a7e, 0x86dccd74fce79610 +data8 0xa9ea8686f556f645, 0xd614b539c6194104 +data8 0x870453c845acf90f, 0xaa1c52d17906bb19 +data8 0xd6537310e224283f, 0x872c089a1e90342c +data8 0xaa4e59b046dab887, 0xd6927ab62244c917 +data8 0x8753ec4a92d16c5e, 0xaa809b9c60d1890b +data8 0xd6d1ccc1fc4ef4b7, 0x877bff3aca19f6b4 +data8 0xaab319102f3f9b33, 0xd71169cea98fdded +data8 0x879d88b6fe1c324c, 0xaadd5a18c1e21274 +data8 0xd746a66a5bc9f6d9, 0x87c5f346dbf98c3a +data8 0xab1045f2ac31bdf5, 0xd786ce8f0fae5317 +data8 0x87e7c653efacef2c, 0xab3ae3ab2df7231e +data8 0xd7bc7ff214c4e75a, 0x881089d4e73ffefc +data8 0xab6e3f945d1e96fc, 0xd7fd35467a517ed1 +data8 0x88397e6a366f2a8a, 0xaba1d953a08fa94e +data8 0xd83e38838648d815, 0x885bc559e5e1c081 +data8 0xabcd090db7ef4c3f, 0xd874a1db598b8951 +data8 0x887e2ee392bb7a93, 0xabf864602d7c323d +data8 0xd8ab42205b80edaf, 0x88a7a8587e404257 +data8 0xac2ca5886ccf9b57, 0xd8ed1849d202f965 +data8 0x88ca5eda67594784, 0xac5861d4aa441f0f +data8 0xd92432bd5a173685, 0x88f4356166bd590e +data8 0xac8d183fe3a2fbed, 0xd9669ca45b03c23e +data8 0x89173a0acf5ce026, 0xacb93703ff51571e +data8 0xd99e3327cf89574e, 0x893a62a098b6a57b +data8 0xace5830ad0c3f14b, 0xd9d602b19b100466 +data8 0x895daf637236ae2c, 0xad11fca5d78b3ff2 +data8 0xda0e0ba86c096841, 0x89883b9d1c2fa9c5 +data8 0xad4797fddf91a798, 0xda5195fcdb1c3dce +data8 0x89abd8dd374a5d7b, 0xad747701e559ebcb +data8 0xda8a1eb87a491f6c, 0x89cf9b1dcd197fa0 +data8 0xada184a47e9c7613, 0xdac2e230b91c3f84 +data8 0x89f382a258ea79de, 0xadcec13ab0dda8ff +data8 0xdafbe0d0b66aea30, 0x8a178faf06648f29 +data8 0xadfc2d1a5fd21ba8, 0xdb351b04a8fafced +data8 0x8a3bc288b3e1d18a, 0xae29c89a5053c33a +data8 0xdb6e9139e33cdd8e, 0x8a601b74f4d1f835 +data8 0xae5794122b638df9, 0xdba843ded7151ea1 +data8 0x8a849aba14274764, 0xae858fda8137ae0a +data8 0xdbe2336319b61fc8, 0x8aa9409f16cdbc9b +data8 0xaeb3bc4ccc56d3d1, 0xdc1c60376789fa68 +data8 0x8ace0d6bbe2cb316, 0xaee219c374c09920 +data8 0xdc56cacda82d0cd5, 0x8af301688ab33558 
+data8 0xaf10a899d3235fe7, 0xdc917398f2797814 +data8 0x8b181cdebe6f3206, 0xaf3f692c341fe8b4 +data8 0xdccc5b0d90a3e628, 0x8b3d60185fafcb7c +data8 0xaf6e5bd7db9ae6c2, 0xdd0781a10469f0f2 +data8 0x8b62cb603bb2fad0, 0xaf9d80fb081cd91b +data8 0xdd42e7ca0b52838f, 0x8b80d7d6bc4104de +data8 0xafc35ce063eb3787, 0xdd729ad01c69114d +data8 0x8ba68bf73ac74f39, 0xaff2ddcb5f28f03d +data8 0xddae749c001fbf5e, 0x8bcc68fb9f9f7335 +data8 0xb022923b148e05c5, 0xddea8f50a51c69b1 +data8 0x8bf26f31c534fca2, 0xb0527a919adbf58b +data8 0xde26eb69a0f0f111, 0x8c10f86e13a1a1f9 +data8 0xb078f3ab1d701c65, 0xde576480262399bc +data8 0x8c3749916cc6abb5, 0xb0a93a6870649f31 +data8 0xde943789645933c8, 0x8c5dc4c4f7706032 +data8 0xb0d9b624d62ec856, 0xded14d58139a28af +data8 0x8c7cac3a8c42e3e0, 0xb100a5f53fb3c8e1 +data8 0xdf025c00bbf2b5c7, 0x8ca373f1b7bf2716 +data8 0xb131821882f5540a, 0xdf3feb44d723a713 +data8 0x8cc29907fb951294, 0xb158bf8e4cb04055 +data8 0xdf715bc16c159be0, 0x8ce9ae4e9492aac8 +data8 0xb189fd69d56b238f, 0xdfaf66240e29cda8 +data8 0x8d0911dddbfdad0e, 0xb1b189958e8108e4 +data8 0xdfe139cbf6e19bdc, 0x8d3075c4f20f04ee +data8 0xb1e32a8165b09832, 0xe01fc0fe94d9fc52 +data8 0x8d5018a9d4de77d5, 0xb20b0678fc271eec +data8 0xe051f92ffcc0bd60, 0x8d77cc47dd143515 +data8 0xb23d0bd3f7592b6e, 0xe090feec9c9a06ac +data8 0x8d97af6352739cb7, 0xb26538b2db8420dc +data8 0xe0c39d0c9ff862d6, 0x8db7af523167800f +data8 0xb28d89e339ceca14, 0xe0f668eeb99f188d +data8 0x8ddfd80bc68c32ff, 0xb2c022ca12e55a16 +data8 0xe1362890eb663139, 0x8e00197e1e7c88fe +data8 0xb2e8c6852c6b03f1, 0xe1695c7212aecbaa +data8 0x8e207859f77e20e7, 0xb3118f4eda9fe40f +data8 0xe19cbf0391bbbbe9, 0x8e40f4ce60c9f8e2 +data8 0xb33a7d6268109ebe, 0xe1d050901c531e85 +data8 0x8e69ba46cf2fde4d, 0xb36ddbc5ea70ec55 +data8 0xe2110903b4f4047a, 0x8e8a7a00bd7ae63e +data8 0xb3971e9b39264023, 0xe2450559b4d80b6d +data8 0x8eab57ef1cf2f529, 0xb3c0877ecc18e24a +data8 0xe27931a231554ef3, 0x8ecc5442cffb1dad +data8 0xb3ea16ae3a6c905f, 0xe2ad8e2ac3c5b04b +data8 
0x8eed6f2d2a4acbfe, 0xb413cc67aa0e4d2d +data8 0xe2e21b41b9694cce, 0x8f0ea8dff24441ff +data8 0xb43da8e9d163e1af, 0xe316d93615862714 +data8 0x8f385c95d696b817, 0xb47233773b84d425 +data8 0xe3590bd86a0d30f9, 0x8f59dc43edd930f3 +data8 0xb49c6825430fe730, 0xe38e38e38e38e38e +data8 0x8f7b7b5f5ffad1c4, 0xb4c6c46bcdb27dcf +data8 0xe3c397d1e6db7839, 0x8f9d3a1bea165f38 +data8 0xb4f1488c0b35d26f, 0xe3f928f5953feb9e +data8 0x8fbf18adc34b66da, 0xb51bf4c7c51f0168 +data8 0xe42eeca17c62886c, 0x8fe117499e356095 +data8 0xb546c9616087ab9c, 0xe464e32943446305 +data8 0x90033624aa685f8d, 0xb571c69bdffd9a70 +data8 0xe49b0ce15747a8a2, 0x9025757495f36b86 +data8 0xb59cecbae56984c3, 0xe4d16a1eee94e9d4 +data8 0x903f3a5dcc091203, 0xb5bd64512bb14bb7 +data8 0xe4fa52107353f67d, 0x9061b2fceb2bdbab +data8 0xb5e8d2a4bf5ba416, 0xe5310a471f4d2dc3 +data8 0x90844ca7211032a7, 0xb6146a9a1bc47819 +data8 0xe567f6f1c2b9c224, 0x90a7079403e6a15d +data8 0xb6402c7749d621c0, 0xe59f18689a9e4c9a +data8 0x90c9e3fbafd63799, 0xb66c1882fb435ea2 +data8 0xe5d66f04b8a68ecf, 0x90ece216c8a16ee4 +data8 0xb6982f048c999a56, 0xe60dfb2005c192e9 +data8 0x9110021e7b516f0a, 0xb6c47044075b4142 +data8 0xe645bd1544c7ea51, 0x912a708a39be9075 +data8 0xb6e5bd6bfd02bafd, 0xe66fb21b505b20a0 +data8 0x914dcc7b31146370, 0xb7124a2736ff8ef2 +data8 0xe6a7d32af4a7c59a, 0x91714af8cfe984d5 +data8 0xb73f026a01e94177, 0xe6e02b129c6a5ae4 +data8 0x918c00a6f3795e97, 0xb760a959f1d0a7a7 +data8 0xe70a9136a7403039, 0x91afbc299ed0295d +data8 0xb78dae7e06868ab0, 0xe74349fb2d92a589 +data8 0x91d39add3e958db0, 0xb7badff8ad9e4e02 +data8 0xe77c3a9c86ed7d42, 0x91ee9920a8974d92 +data8 0xb7dce25b8e17ae9f, 0xe7a713f88151518a +data8 0x9212b5fcac537c19, 0xb80a6226904045e2 +data8 0xe7e067453317ed2b, 0x9236f6b256923fcf +data8 0xb8380f1cafd73c1c, 0xe819f37a81871bb5 +data8 0x92523ee6f90dcfc3, 0xb85a6ea8e321b4d8 +data8 0xe8454236bfaeca14, 0x9276bef031e6eb79 +data8 0xb8886b684ae7d2fa, 0xe87f32f24c3fc90e +data8 0x929236ec237a24ad, 0xb8ab0726fa00cf5d +data8 0xe8aacd8688892ba6, 
0x92b6f70b7efe9dc3 +data8 0xb8d954a4d13b7cb1, 0xe8e523fd32f606f7 +data8 0x92d29f61eec7dc2b, 0xb8fc2d4f6cd9f04a +data8 0xe9110b5311407927, 0x92f7a05d5b8ba92f +data8 0xb92acc851476b1ab, 0xe94bc8bf0c108fa3 +data8 0x931379a403be5c16, 0xb94de2d841a184c2 +data8 0xe977fdc439c2ca3c, 0x9338bc44de2e3f34 +data8 0xb97cd4c36c92693c, 0xe9b3236528fc349e +data8 0x9354c71412c69486, 0xb9a0297f172665e3 +data8 0xe9dfa70b745ac1b4, 0x937a4c273907e262 +data8 0xb9cf6f21e36c3924, 0xea1b36268d0eaa38 +data8 0x93968919f6e7975d, 0xb9f3030951267208 +data8 0xea480963fd394197, 0x93bc516fdd4680c9 +data8 0xba229d6a618e7c59, 0xea84034425f27484 +data8 0x93d8c123d9be59b2, 0xba467144459f9855 +data8 0xeab12713138dd1cc, 0x93f546c955e60076 +data8 0xba6a60c3c48f1a4b, 0xeade6db73a5e503b +data8 0x941b70a65879079f, 0xba9a76056b67ee7a +data8 0xeb1b0268343b121b, 0x943829f337410591 +data8 0xbabea699563ada6e, 0xeb489b0b2bdb5f14 +data8 0x9454f995765bc4d2, 0xbae2f350b262cc4b +data8 0xeb765721e85f03d0, 0x947b86b57f5842ed +data8 0xbb1385a23be24e57, 0xebb389645f222f62 +data8 0x94988aeb23470f86, 0xbb3814975e17c680 +data8 0xebe198f090607e0c, 0x94b5a5dc9695f42a +data8 0xbb5cc031009bf467, 0xec0fcc9321024509 +data8 0x94d2d7a9170d8b42, 0xbb81889680024764 +data8 0xec3e247da8b82f61, 0x94f9e87dd78bf019 +data8 0xbbb2c0d8703ae95d, 0xec7c27d21321c9f7 +data8 0x95175019a503d89e, 0xbbd7cd09ba3c5463 +data8 0xecaad5278824e453, 0x9534cefa625fcb3a +data8 0xbbfcf68c4977718f, 0xecd9a76d097d4e77 +data8 0x955265405c491a25, 0xbc223d88cfc88eee +data8 0xed089ed5dcd99446, 0x9570130c1f9bb857 +data8 0xbc47a2284fee4ff8, 0xed37bb95add09a1c +data8 0x9597ca4119525184, 0xbc79ac0916ed7b8a +data8 0xed76c70508f904b6, 0x95b5af6fb5aa4d3c +data8 0xbc9f5670d1a13030, 0xeda63bb05e7f93c6 +data8 0x95d3ac9273aafd7a, 0xbcc51f068cb95c1d +data8 0xedd5d661daed2dc4, 0x95f1c1cafdfd3684 +data8 0xbceb05f4b30a9bc0, 0xee05974eef86b903 +data8 0x960fef3b430b8d5f, 0xbd110b6604c7d306 +data8 0xee357ead791fc670, 0x962e350575b409c5 +data8 0xbd372f8598620f19, 0xee658cb3c134a463 
+data8 0x964c934c0dfc1708, 0xbd5d727edb6b3c7e +data8 0xee95c1987f080211, 0x966b0a31c9c6bc7d +data8 0xbd83d47d937bbc6d, 0xeec61d92d8c4314f +data8 0x968999d9ad8d264e, 0xbdaa55addf1ae47d +data8 0xeef6a0da64a014ac, 0x96a8426705198795 +data8 0xbdd0f63c36aa73f0, 0xef274ba72a07c811 +data8 0x96c703fd64445ee5, 0xbdf7b6556d550a15 +data8 0xef581e31a2c91260, 0x96e5dec0a7b4268d +data8 0xbe1e9626b1ffa96b, 0xef8918b2bc43aec6 +data8 0x9704d2d4f59f79f3, 0xbe4595dd903e5371 +data8 0xefba3b63d89d7cbf, 0x9723e05ebe91b9b0 +data8 0xbe6cb5a7f14bc935, 0xefeb867ecffaa607 +data8 0x97430782be323831, 0xbe93f5b41d047cf7 +data8 0xf01cfa3df1b9c9fa, 0x97624865fc0df8bf +data8 0xbebb5630bae4c15f, 0xf04e96dc05b43e2d +data8 0x9781a32dcc640b2a, 0xbee2d74cd30a430c +data8 0xf0805c944d827454, 0x97a117ffd0f48e46 +data8 0xbf0a7937cf38d981, 0xf0b24ba285c495cb +data8 0x97c0a701f9d263c9, 0xbf323c217be2bc8c +data8 0xf0e46442e76f6569, 0x97e0505a8637a036 +data8 0xbf5a203a09342bbb, 0xf116a6b2291d7896 +data8 0x97f57a9fb0b08c6e, 0xbf74cad1c14ebfc4 +data8 0xf1383fa9e9b5b381, 0x9815503365914a9d +data8 0xbf9ce6a497a89f78, 0xf16ac84f90083b9b +data8 0x98354085054fd204, 0xbfc52428bec6e72f +data8 0xf19d7b686dcb03d7, 0x98554bbbf8a77902 +data8 0xbfed838fddab024b, 0xf1d0593311db1757 +data8 0x987571fffb7f94f6, 0xc016050c0420981a +data8 0xf20361ee8f1c711e, 0x9895b3791dd03c23 +data8 0xc03ea8cfabddc330, 0xf23695da7de51d3f +data8 0x98ab43a5fc65d0c8, 0xc059d3cbd65ddbce +data8 0xf258d095e465cc35, 0x98cbb2d196bd713d +data8 0xc082b122a3c78c9d, 0xf28c4d0bfc982b34 +data8 0x98ec3d9ec7b6f21a, 0xc0abb1499ae736c4 +data8 0xf2bff55eb3f0ea71, 0x990ce436db5e8344 +data8 0xc0d4d474c3aedaaf, 0xf2f3c9cf9884636e +data8 0x9922b8218160967a, 0xc0f054ca33eb3437 +data8 0xf31670135ab9cc0f, 0x99438d686f75779d +data8 0xc119b2c67e600ed0, 0xf34a8e9f0b54cdfb +data8 0x99647eea131fa20b, 0xc1433453de2033ff +data8 0xf37ed9fa6b8add3f, 0x997a85045a47c6d0 +data8 0xc15ef3e44e10032d, 0xf3a1cfe884ef6bb6 +data8 0x999ba5f14f8add02, 0xc188b130431d80e6 +data8 
0xf3d66689dcc8e8d3, 0x99bce38b5465ecae +data8 0xc1b2929d6067730e, 0xf40b2ab069d5c96a +data8 0x99d31ca0887f30f9, 0xc1ce9268f31cc734 +data8 0xf42e718b90c8bc16, 0x99f48a669c74c09e +data8 0xc1f8b0877c1b0c08, 0xf463822a0a3b4b00 +data8 0x9a16154eb445c873, 0xc222f35a87b415ba +data8 0xf498c1076015faf8, 0x9a2c822ec198d667 +data8 0xc23f3467349e5c88, 0xf4bc5a19a33990b5 +data8 0x9a4e3e080cd91b78, 0xc269b4e40e088c01 +data8 0xf4f1e6a7d6f5425f, 0x9a70177afe52322e +data8 0xc2945aac24daaf6e, 0xf527a232cf6be334 +data8 0x9a86b8fa94eebe10, 0xc2b0de05e43c1d66 +data8 0xf54b8ecdcda90851, 0x9aa8c42866ae2958 +data8 0xc2dbc275e1229d09, 0xf5819949c7ad87b4 +data8 0x9abf86f9e12fc45e, 0xc2f86fca9d80eeff +data8 0xf5a5bac9213b48a9, 0x9ae1c462fc05f49d +data8 0xc323938449a2587e, 0xf5dc1501f324a812 +data8 0x9af8a8dc936b84d0, 0xc3406b40a538ed20 +data8 0xf6006bee86b5589e, 0x9b1b19033be35730 +data8 0xc36bcee8211d15e0, 0xf63716b2fa067fa4 +data8 0x9b3da7daf04c2892, 0xc397593adf2ba366 +data8 0xf66df22fb6132b9c, 0x9b54c2e4c8a9012b +data8 0xc3b475b6206155d5, 0xf6929fb98225deb1 +data8 0x9b77854e6c661200, 0xc3e0410243b97383 +data8 0xf6c9cd13021e3fea, 0x9b8ec2e678d56d2f +data8 0xc3fd890709833d37, 0xf6eeb177472cedae +data8 0x9ba60e6a5ca133b6, 0xc41ae295f7e7fa06 +data8 0xf713abf4cb0b3afb, 0x9bc919ea66a151a4 +data8 0xc44709f7bb8a4dd2, 0xf74b4d5333684ef1 +data8 0x9be0887c09ef82bb, 0xc4648fb0e0bec4c1 +data8 0xf7707f75a72f8e94, 0x9c03c8d5fffc3503 +data8 0xc490f9a94695ba14, 0xf7a874b97927af44 +data8 0x9c1b5ad21a81cbb9, 0xc4aeac0173b7d390 +data8 0xf7cddf140aedf1d8, 0x9c3ed09216e9ca02 +data8 0xc4db5941007aa853, 0xf806291bacb7f7a9 +data8 0x9c568656c0423def, 0xc4f938aec206291a +data8 0xf82bcc43b92eafef, 0x9c7a320af242ce60 +data8 0xc52629e899dfd622, 0xf8646bf0defb759e +data8 0x9c920bf7a8c01dc2, 0xc54436e44043b965 +data8 0xf88a487dfc3ff5f7, 0x9ca9f475d98b159c +data8 0xc562563abf9ea07f, 0xf8b03c2b46cdc17f +data8 0x9ccdeca60e80b5f8, 0xc58fa7d1dc42921c +data8 0xf8e95541c152ae7a, 0x9ce5f9d4653d4902 +data8 0xc5adf561b91e110a, 
0xf90f832c2700c160 +data8 0x9cfe15cb38bfdd8e, 0xc5cc5591bdbd82fa +data8 0xf935c88e0c7f419b, 0x9d225b983f6c1f96 +data8 0xc5fa08f1ff20593c, 0xf96f5cd84fd86873 +data8 0x9d3a9cca32261ed7, 0xc618980a79ce6862 +data8 0xf995dd53ebdd9d6d, 0x9d52ecfccebe1768 +data8 0xc6373a09e34b50fa, 0xf9bc75a034436a41 +data8 0x9d77818d95b82f86, 0xc66550a6e0baaf35 +data8 0xf9f686f26d5518de, 0x9d8ff7893fa4706c +data8 0xc6842241926342c9, 0xfa1d5b39b910a8c5 +data8 0x9da87cbef36f2a5e, 0xc6a3070b7c93bb9e +data8 0xfa4447acc4ecbfd2, 0x9dcd6140b4a35aeb +data8 0xc6d18260bb84081b, 0xfa7ed7e51e6fdfb4 +data8 0x9de60cd06dc6e2d4, 0xc6f0977c9416828b +data8 0xfaa601394d49a1a0, 0x9dfec7d4cc43b76f +data8 0xc70fc0117c641630, 0xfacd431644ce0e40 +data8 0x9e17925ec9fccc4a, 0xc72efc34d7e615be +data8 0xfaf49d96f7a75909, 0x9e3cdf6db57dc075 +data8 0xc75dfb441594141e, 0xfb2fd3c65e562fd5 +data8 0x9e55d110b63637a8, 0xc77d68aa019bda4c +data8 0xfb576c5762024805, 0x9e6ed27594550d2e +data8 0xc79ce9ea478dbc4f, 0xfb7f1debc22c4040 +data8 0x9e87e3adc385d393, 0xc7bc7f1ae453219d +data8 0xfba6e89f32d0190a, 0x9ead9b54b37a1055 +data8 0xc7ec0476e15e141a, 0xfbe2c803a0894893 +data8 0x9ec6d46a3d7de215, 0xc80bcbe16f1d540f +data8 0xfc0ad1ff0ed9ecf0, 0x9ee01d9108be3154 +data8 0xc82ba78a5d349735, 0xfc32f57bdfbcbe7f +data8 0x9ef976db07288d04, 0xc84b978847a06b87 +data8 0xfc5b32968f99b21c, 0x9f12e05a4759ec25 +data8 0xc86b9bf1ee817bc6, 0xfc83896bc861ab08 +data8 0x9f2c5a20f4da6668, 0xc88bb4de3667cdf4 +data8 0xfcabfa1861ed4815, 0x9f52af78ed1733ca +data8 0xc8bc00e7fe9e23a3, 0xfce8d3cea7d3163e +data8 0x9f6c52426a39d003, 0xc8dc4d7ff2d25232 +data8 0xfd118595143ee273, 0x9f860593d42fd7f3 +data8 0xc8fcaeebcb40eb47, 0xfd3a519943d4865a +data8 0x9f9fc97fdb96bd51, 0xc91d25431426a663 +data8 0xfd6337f8e1ae5a4b, 0x9fb99e194f4a7037 +data8 0xc93db09d7fdb2949, 0xfd8c38d1c8e927eb +data8 0x9fd383731ca51db9, 0xc95e5112e721582a +data8 0xfdb5544205095a53, 0x9fed79a04fbf9423 +data8 0xc97f06bb49787677, 0xfdde8a67d2613531 +data8 0xa00780b413b24ee8, 0xc99fd1aecd6e1b06 
+data8 0xfe07db619e781611, 0xa02eab2c4474b0cd +data8 0xc9d12a3e27bb1625, 0xfe460768d80bf758 +data8 0xa048dcd51ccfd142, 0xc9f22ad82ba3d5f0 +data8 0xfe6f9bfb06cd32f6, 0xa0631fa894b11b8d +data8 0xca134113105e67b2, 0xfe994bcd3d14fcc2 +data8 0xa07d73ba65e680af, 0xca346d07b045a876 +data8 0xfec316fecaf3f2ab, 0xa097d91e6aaf71b0 +data8 0xca55aecf0e94bb88, 0xfeecfdaf33fadb80 +data8 0xa0b24fe89e02602f, 0xca77068257be9bab +data8 0xff16fffe2fa8fad6, 0xa0ccd82d1bd2f68b +data8 0xca98743ae1c693a8, 0xff411e0ba9db886d +data8 0xa0e77200215909e6, 0xcab9f8122c99a101 +data8 0xff6b57f7c33e4e9a, 0xa1021d760d584855 +data8 0xcadb9221e268c3b5, 0xff95ade2d1bd7358 +data8 0xa11cdaa36068a57d, 0xcafd4283d8043dfd +data8 0xffc01fed60f86fb5, 0xa137a99cbd3f880b +data8 0xcb1f09520d37c6fb, 0xffeaae3832b63956 +ASM_SIZE_DIRECTIVE(T_table) + + + + + + +.align 32 +.global cbrtf# + +.section .text +.proc cbrtf# +.align 32 +cbrtf: + + +{ .mfi + getf.sig r28=f8 + // cbrtf entry: the argument a is read from f8 and the result is written back to f8 + // will continue only for normal/denormal numbers +(p0) fclass.nm.unc p12,p7 = f8, 0x1b + // r2 = pointer to C_1,C_2 followed by T_table + addl r2 = @ltoff(poly_coeffs), gp +} +{.mfi + // r29=bias-((2^8-1)/3) -63=0xffff-0x55-0x3f=0xff6b + mov r29=0xff6b + // normalize a + fma.s1 f14=f8,f1,f0 + nop.i 0;; +} +{.mib + nop.m 0 + // p7 (input passed the class test): a zero significand in r28 also sets p12 + (p7) cmp.eq p12,p0=r28,r0 + nop.b 0;; +} +{.mfb + // load start address for C_1,C_2 followed by T_table + ld8 r2=[r2] + // p12: early return of a*1+0 + (p12) fma.s.s0 f8=f8,f1,f0 + (p12) br.ret.spnt b0;; +} +{.mmf + // load C_1 + ldfe f7=[r2],16 + nop.m 0 + // y=frcpa(a) + frcpa.s0 f8,p6=f1,f8;; +} +{.mmi + // load C_2 + ldfe f9=[r2],16 + // r28=bias-(2^8-1) + mov r28=0xff00 + nop.i 0;; +} +{.mmi + // get normalized significand + getf.sig r23=f14 + // get exponent + getf.exp r24=f14 + mov r25=0x20000;; +} +{.mii + // get r26=sign + and r26=r24,r25 + // eliminate leading 1 from r23=1st table index + shl r23=r23,1 + // eliminate sign from exponent (r25) + andcm r25=r24,r25;; +} +{.mfi + // subtract bias from r25=exponent + sub r25=r25,r28 + // r=1-a*y + (p6) 
fnma.s1 f6=f8,f14,f1 + // r23=1st table index (y_index, 8 bits) + shr.u r23=r23,56;; +} +{.mii + // 1: exponent*=5; // (2^{16}-1)/3=0x5555 + shladd r24=r25,2,r25 + // r23=3*y_index + shladd r23=r23,1,r23;; + // r30=(5*expon)*16+5*expon=(0x55)*expon + shladd r30=r24,4,r24;; +} +{.mmi + // adjust T_table pointer by 1st index + shladd r2=r23,3,r2;; + // f10=T[0][y] + (p6) ldf8 f10=[r2],8 + // r24=(0x5500)*expon + shl r24=r30,8;; +} +{.mfi + // f11=T[1][y] + (p6) ldf8 f11=[r2],8 + // P_1=C_1+C_2*r + (p6) fma.s1 f7=f9,f6,f7 + // r24=(0x5555)*expon + add r24=r24,r30;; +} +{.mmi + // r24=(0x5556)*expon // 0x5556=(2^{16}+2)/3 + add r24=r24,r25;; + // f8=T[2][y] + (p6) ldf8 f8=[r2] + // r24=floor(expon/3) + shr r24=r24,16;; +} +{.mmi + nop.m 0 + // r28=3*floor(expon/3) + shladd r28=r24,1,r24 + // bias exponent + add r24=r29,r24;; +} +{.mmi + // get remainder of exponent/3 : r25-r28 + sub r25=r25,r28 + // add sign to exponent + or r24=r24,r26 + nop.i 0;; +} +{.mfi + nop.m 0 + // P_2=-r*P_1 + (p6) fnma.s1 f6=f7,f6,f0 + // remainder=0 ? + (p6) cmp.eq.unc p7,p8=r0,r25;; +} +{.mfi + // f14=sign*2^{floor(expon/3)}, built from the re-biased exponent in r24 + (p6) setf.exp f14=r24 + nop.f 0 + // remainder = 1 ? (p12 is set when it is neither 0 nor 1) + (p8) cmp.eq.unc p8,p12=1,r25;; +} +.pred.rel "mutex",p7,p8 +{.mfi + nop.m 0 + // remainder=0 -> use T=f10: f8=T+T*P_2 + (p7) fma.s1 f8=f10,f6,f10 + nop.i 0 +} +{.mfi + nop.m 0 + // remainder=1 -> use T=f11: f8=T+T*P_2 + (p8) fma.s1 f8=f11,f6,f11 + nop.i 0;; +} +{.mfi + nop.m 0 + // remainder=2 (p12) -> use T=f8 (T[2][y]): f8=T+T*P_2 + (p12) fma.s.s0 f8=f8,f6,f8 + nop.i 0;; +} +{.mfb + nop.m 0 + // result=(T+T*P_2)*f14, with f14=sgn*2^{floor(expon/3)} + (p6) fma.s.s0 f8=f8,f14,f0 + br.ret.sptk b0;; +} +.endp cbrtf +ASM_SIZE_DIRECTIVE(cbrtf) diff --git a/sysdeps/ia64/fpu/s_cbrtl.S b/sysdeps/ia64/fpu/s_cbrtl.S new file mode 100644 index 0000000000..c44ecf7065 --- /dev/null +++ b/sysdeps/ia64/fpu/s_cbrtl.S @@ -0,0 +1,889 @@ +.file "cbrtl.asm" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. 
+// +// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang +// of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 4/28/00: Initial version +// +// API +//============================================================== +// long double cbrtl(long double) +// +// Overview of operation +//============================================================== +// Background +// +// Implementation +// +// cbrt(a) = cbrt(a y) / cbrt(y) +// = cbrt(1 - (1 - a y)) * 1/cbrt(y) +// +// where y = frcpa(a). +// +// * cbrt(1 - (1 - a y)) is approximated by a degree-6 polynomial +// +// 1 - c_1 r - c_2 * r^2 - c_3 * r^3 - c_4 * r^4 - c_5 * r^5 - c_6 * r^6 +// +// in r = 1 - a y. 
+// +// * The values 1/cbrt(y) are stored as two tables of constants T_hi +// (double-extended precision) and D (single precision) as follows: +// +// T_hi (1 + D) = 1/cbrt(y) to about 80 bits of accuracy +// +// The tables are only stored for three exponent values and are +// then multiplied by 2^(e/3), where e is the exponent of the input number. +// This computation is carried out in parallel with the polynomial +// evaluation: +// +// T = 2^(e/3) * T_hi + + + + + +//=============== +// input = x +// C = frcpa(x) +// r = C * x - 1 +// +// Special values +//============================================================== + + + +// Registers used +//============================================================== +// f6-f15 +// r2-r3, r23-r30 +// p6,p7,p12 + +#include "libm_support.h" + +// Data tables +//============================================================== + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +poly_coeffs: +ASM_TYPE_DIRECTIVE(poly_coeffs,@object) +data8 0xaaaaaaaaaaaaaab1, 0x00003ffd // C_1 +data8 0xe38e38e38e38e3e0, 0x00003ffb // C_2 +data8 0x3faf9add3c0be9a6, 0x3fa511e8d2b1f749 // C_3, C_4 +data8 0x3f9ee71b2c6ebe99, 0x3f9809180fd0340c // C_5, C_6 +ASM_SIZE_DIRECTIVE(poly_coeffs) + +T_table: +ASM_TYPE_DIRECTIVE(T_table,@object) + +data8 0x80155c748c374836, 0x8040404b0879f7f9 +data8 0x806b5dce4b405c10, 0x8096b586974669b1 +data8 0x80bcd273d952a028, 0x80e898c52813f2f3 +data8 0x81149add67c2d208, 0x813b4e2c856b6e9a +data8 0x8167c1dde03de7aa, 0x818ed973b811135e +data8 0x81bbc0c33e13ec98, 0x81e33e69fbe7504a +data8 0x820aec524e3c23e9, 0x823880f78e70b805 +data8 0x826097a62a8e5200, 0x8288dfe00e9b5eaf +data8 0x82b15a10c5371624, 0x82da06a527b18937 +data8 0x8302e60b635ab394, 0x832bf8b2feec2f0e +data8 0x83553f0ce00e276b, 0x837eb98b50f8322a +data8 0x83a270f44c84f699, 0x83cc4d7cfcfac5ca +data8 0x83f65f78a8872b4c, 0x8420a75f2f7b53c8 +data8 0x844510461ff14209, 0x846fbd91b930bed2 +data8 0x84947e18234f3294, 0x84bf92755825045a +data8 0x84e4ac0ee112ba51, 
0x8509ef44b86f20be +data8 0x85359d5d91768427, 0x855b3bd5b7384357 +data8 0x858104f0c415f79a, 0x85a6f90390d29864 +data8 0x85d3772fcd56a1dd, 0x85f9c982fcc002f3 +data8 0x862047e0e7ea554b, 0x8646f2a26f7f5852 +data8 0x866dca21754096b5, 0x8694ceb8dfd17a37 +data8 0x86bc00c49e9307e8, 0x86dccd74fce79610 +data8 0x870453c845acf90f, 0x872c089a1e90342c +data8 0x8753ec4a92d16c5e, 0x877bff3aca19f6b4 +data8 0x879d88b6fe1c324c, 0x87c5f346dbf98c3a +data8 0x87e7c653efacef2c, 0x881089d4e73ffefc +data8 0x88397e6a366f2a8a, 0x885bc559e5e1c081 +data8 0x887e2ee392bb7a93, 0x88a7a8587e404257 +data8 0x88ca5eda67594784, 0x88f4356166bd590e +data8 0x89173a0acf5ce026, 0x893a62a098b6a57b +data8 0x895daf637236ae2c, 0x89883b9d1c2fa9c5 +data8 0x89abd8dd374a5d7b, 0x89cf9b1dcd197fa0 +data8 0x89f382a258ea79de, 0x8a178faf06648f29 +data8 0x8a3bc288b3e1d18a, 0x8a601b74f4d1f835 +data8 0x8a849aba14274764, 0x8aa9409f16cdbc9b +data8 0x8ace0d6bbe2cb316, 0x8af301688ab33558 +data8 0x8b181cdebe6f3206, 0x8b3d60185fafcb7c +data8 0x8b62cb603bb2fad0, 0x8b80d7d6bc4104de +data8 0x8ba68bf73ac74f39, 0x8bcc68fb9f9f7335 +data8 0x8bf26f31c534fca2, 0x8c10f86e13a1a1f9 +data8 0x8c3749916cc6abb5, 0x8c5dc4c4f7706032 +data8 0x8c7cac3a8c42e3e0, 0x8ca373f1b7bf2716 +data8 0x8cc29907fb951294, 0x8ce9ae4e9492aac8 +data8 0x8d0911dddbfdad0e, 0x8d3075c4f20f04ee +data8 0x8d5018a9d4de77d5, 0x8d77cc47dd143515 +data8 0x8d97af6352739cb7, 0x8db7af523167800f +data8 0x8ddfd80bc68c32ff, 0x8e00197e1e7c88fe +data8 0x8e207859f77e20e7, 0x8e40f4ce60c9f8e2 +data8 0x8e69ba46cf2fde4d, 0x8e8a7a00bd7ae63e +data8 0x8eab57ef1cf2f529, 0x8ecc5442cffb1dad +data8 0x8eed6f2d2a4acbfe, 0x8f0ea8dff24441ff +data8 0x8f385c95d696b817, 0x8f59dc43edd930f3 +data8 0x8f7b7b5f5ffad1c4, 0x8f9d3a1bea165f38 +data8 0x8fbf18adc34b66da, 0x8fe117499e356095 +data8 0x90033624aa685f8d, 0x9025757495f36b86 +data8 0x903f3a5dcc091203, 0x9061b2fceb2bdbab +data8 0x90844ca7211032a7, 0x90a7079403e6a15d +data8 0x90c9e3fbafd63799, 0x90ece216c8a16ee4 +data8 0x9110021e7b516f0a, 0x912a708a39be9075 
+data8 0x914dcc7b31146370, 0x91714af8cfe984d5 +data8 0x918c00a6f3795e97, 0x91afbc299ed0295d +data8 0x91d39add3e958db0, 0x91ee9920a8974d92 +data8 0x9212b5fcac537c19, 0x9236f6b256923fcf +data8 0x92523ee6f90dcfc3, 0x9276bef031e6eb79 +data8 0x929236ec237a24ad, 0x92b6f70b7efe9dc3 +data8 0x92d29f61eec7dc2b, 0x92f7a05d5b8ba92f +data8 0x931379a403be5c16, 0x9338bc44de2e3f34 +data8 0x9354c71412c69486, 0x937a4c273907e262 +data8 0x93968919f6e7975d, 0x93bc516fdd4680c9 +data8 0x93d8c123d9be59b2, 0x93f546c955e60076 +data8 0x941b70a65879079f, 0x943829f337410591 +data8 0x9454f995765bc4d2, 0x947b86b57f5842ed +data8 0x94988aeb23470f86, 0x94b5a5dc9695f42a +data8 0x94d2d7a9170d8b42, 0x94f9e87dd78bf019 +data8 0x95175019a503d89e, 0x9534cefa625fcb3a +data8 0x955265405c491a25, 0x9570130c1f9bb857 +data8 0x9597ca4119525184, 0x95b5af6fb5aa4d3c +data8 0x95d3ac9273aafd7a, 0x95f1c1cafdfd3684 +data8 0x960fef3b430b8d5f, 0x962e350575b409c5 +data8 0x964c934c0dfc1708, 0x966b0a31c9c6bc7d +data8 0x968999d9ad8d264e, 0x96a8426705198795 +data8 0x96c703fd64445ee5, 0x96e5dec0a7b4268d +data8 0x9704d2d4f59f79f3, 0x9723e05ebe91b9b0 +data8 0x97430782be323831, 0x97624865fc0df8bf +data8 0x9781a32dcc640b2a, 0x97a117ffd0f48e46 +data8 0x97c0a701f9d263c9, 0x97e0505a8637a036 +data8 0x97f57a9fb0b08c6e, 0x9815503365914a9d +data8 0x98354085054fd204, 0x98554bbbf8a77902 +data8 0x987571fffb7f94f6, 0x9895b3791dd03c23 +data8 0x98ab43a5fc65d0c8, 0x98cbb2d196bd713d +data8 0x98ec3d9ec7b6f21a, 0x990ce436db5e8344 +data8 0x9922b8218160967a, 0x99438d686f75779d +data8 0x99647eea131fa20b, 0x997a85045a47c6d0 +data8 0x999ba5f14f8add02, 0x99bce38b5465ecae +data8 0x99d31ca0887f30f9, 0x99f48a669c74c09e +data8 0x9a16154eb445c873, 0x9a2c822ec198d667 +data8 0x9a4e3e080cd91b78, 0x9a70177afe52322e +data8 0x9a86b8fa94eebe10, 0x9aa8c42866ae2958 +data8 0x9abf86f9e12fc45e, 0x9ae1c462fc05f49d +data8 0x9af8a8dc936b84d0, 0x9b1b19033be35730 +data8 0x9b3da7daf04c2892, 0x9b54c2e4c8a9012b +data8 0x9b77854e6c661200, 0x9b8ec2e678d56d2f +data8 
0x9ba60e6a5ca133b6, 0x9bc919ea66a151a4 +data8 0x9be0887c09ef82bb, 0x9c03c8d5fffc3503 +data8 0x9c1b5ad21a81cbb9, 0x9c3ed09216e9ca02 +data8 0x9c568656c0423def, 0x9c7a320af242ce60 +data8 0x9c920bf7a8c01dc2, 0x9ca9f475d98b159c +data8 0x9ccdeca60e80b5f8, 0x9ce5f9d4653d4902 +data8 0x9cfe15cb38bfdd8e, 0x9d225b983f6c1f96 +data8 0x9d3a9cca32261ed7, 0x9d52ecfccebe1768 +data8 0x9d77818d95b82f86, 0x9d8ff7893fa4706c +data8 0x9da87cbef36f2a5e, 0x9dcd6140b4a35aeb +data8 0x9de60cd06dc6e2d4, 0x9dfec7d4cc43b76f +data8 0x9e17925ec9fccc4a, 0x9e3cdf6db57dc075 +data8 0x9e55d110b63637a8, 0x9e6ed27594550d2e +data8 0x9e87e3adc385d393, 0x9ead9b54b37a1055 +data8 0x9ec6d46a3d7de215, 0x9ee01d9108be3154 +data8 0x9ef976db07288d04, 0x9f12e05a4759ec25 +data8 0x9f2c5a20f4da6668, 0x9f52af78ed1733ca +data8 0x9f6c52426a39d003, 0x9f860593d42fd7f3 +data8 0x9f9fc97fdb96bd51, 0x9fb99e194f4a7037 +data8 0x9fd383731ca51db9, 0x9fed79a04fbf9423 +data8 0xa00780b413b24ee8, 0xa02eab2c4474b0cd +data8 0xa048dcd51ccfd142, 0xa0631fa894b11b8d +data8 0xa07d73ba65e680af, 0xa097d91e6aaf71b0 +data8 0xa0b24fe89e02602f, 0xa0ccd82d1bd2f68b +data8 0xa0e77200215909e6, 0xa1021d760d584855 +data8 0xa11cdaa36068a57d, 0xa137a99cbd3f880b +data8 0xa160019ed37fb4ae, 0xa1960b5966da4608 +data8 0xa1cc5dbe6dc2aab4, 0xa202f97995b69c0d +data8 0xa232fe6eb0c0577d, 0xa26a2582012f6e17 +data8 0xa2a197e5d10465cb, 0xa2d25a532efefbc8 +data8 0xa30a5bd6e49e4ab8, 0xa33b9c9b59879e24 +data8 0xa3742fca6a3c1f21, 0xa3a5f1273887bf22 +data8 0xa3d7ef508ff11574, 0xa4115ce30548bc15 +data8 0xa443df0e53df577a, 0xa4769fa5913c0ec3 +data8 0xa4a99f303bc7def5, 0xa4dcde37779adf4b +data8 0xa5105d46152c938a, 0xa5441ce89825cb8d +data8 0xa5781dad3e54d899, 0xa5ac602406c4e68c +data8 0xa5d9601d95c2c0bc, 0xa60e1e1a2de14745 +data8 0xa6431f6e3fbd9658, 0xa67864b0d432fda4 +data8 0xa6a6444aa0243c0b, 0xa6dc094d10f25792 +data8 0xa70a574cc02bba69, 0xa7409e2af9549084 +data8 0xa76f5c64ca2cf13b, 0xa79e4f0babab5dc0 +data8 0xa7d5579ae5164b85, 0xa804bd3c6fe61cc8 +data8 0xa8345895e5250a5a, 
0xa8642a122b44ef0b +data8 0xa89c38ca18f6108b, 0xa8cc81063b6e87ca +data8 0xa8fd00bfa409285e, 0xa92db8664d5516da +data8 0xa95ea86b75cc2c20, 0xa98fd141a4992deb +data8 0xa9c1335cae7446ba, 0xa9ea8686f556f645 +data8 0xaa1c52d17906bb19, 0xaa4e59b046dab887 +data8 0xaa809b9c60d1890b, 0xaab319102f3f9b33 +data8 0xaadd5a18c1e21274, 0xab1045f2ac31bdf5 +data8 0xab3ae3ab2df7231e, 0xab6e3f945d1e96fc +data8 0xaba1d953a08fa94e, 0xabcd090db7ef4c3f +data8 0xabf864602d7c323d, 0xac2ca5886ccf9b57 +data8 0xac5861d4aa441f0f, 0xac8d183fe3a2fbed +data8 0xacb93703ff51571e, 0xace5830ad0c3f14b +data8 0xad11fca5d78b3ff2, 0xad4797fddf91a798 +data8 0xad747701e559ebcb, 0xada184a47e9c7613 +data8 0xadcec13ab0dda8ff, 0xadfc2d1a5fd21ba8 +data8 0xae29c89a5053c33a, 0xae5794122b638df9 +data8 0xae858fda8137ae0a, 0xaeb3bc4ccc56d3d1 +data8 0xaee219c374c09920, 0xaf10a899d3235fe7 +data8 0xaf3f692c341fe8b4, 0xaf6e5bd7db9ae6c2 +data8 0xaf9d80fb081cd91b, 0xafc35ce063eb3787 +data8 0xaff2ddcb5f28f03d, 0xb022923b148e05c5 +data8 0xb0527a919adbf58b, 0xb078f3ab1d701c65 +data8 0xb0a93a6870649f31, 0xb0d9b624d62ec856 +data8 0xb100a5f53fb3c8e1, 0xb131821882f5540a +data8 0xb158bf8e4cb04055, 0xb189fd69d56b238f +data8 0xb1b189958e8108e4, 0xb1e32a8165b09832 +data8 0xb20b0678fc271eec, 0xb23d0bd3f7592b6e +data8 0xb26538b2db8420dc, 0xb28d89e339ceca14 +data8 0xb2c022ca12e55a16, 0xb2e8c6852c6b03f1 +data8 0xb3118f4eda9fe40f, 0xb33a7d6268109ebe +data8 0xb36ddbc5ea70ec55, 0xb3971e9b39264023 +data8 0xb3c0877ecc18e24a, 0xb3ea16ae3a6c905f +data8 0xb413cc67aa0e4d2d, 0xb43da8e9d163e1af +data8 0xb47233773b84d425, 0xb49c6825430fe730 +data8 0xb4c6c46bcdb27dcf, 0xb4f1488c0b35d26f +data8 0xb51bf4c7c51f0168, 0xb546c9616087ab9c +data8 0xb571c69bdffd9a70, 0xb59cecbae56984c3 +data8 0xb5bd64512bb14bb7, 0xb5e8d2a4bf5ba416 +data8 0xb6146a9a1bc47819, 0xb6402c7749d621c0 +data8 0xb66c1882fb435ea2, 0xb6982f048c999a56 +data8 0xb6c47044075b4142, 0xb6e5bd6bfd02bafd +data8 0xb7124a2736ff8ef2, 0xb73f026a01e94177 +data8 0xb760a959f1d0a7a7, 0xb78dae7e06868ab0 
+data8 0xb7badff8ad9e4e02, 0xb7dce25b8e17ae9f +data8 0xb80a6226904045e2, 0xb8380f1cafd73c1c +data8 0xb85a6ea8e321b4d8, 0xb8886b684ae7d2fa +data8 0xb8ab0726fa00cf5d, 0xb8d954a4d13b7cb1 +data8 0xb8fc2d4f6cd9f04a, 0xb92acc851476b1ab +data8 0xb94de2d841a184c2, 0xb97cd4c36c92693c +data8 0xb9a0297f172665e3, 0xb9cf6f21e36c3924 +data8 0xb9f3030951267208, 0xba229d6a618e7c59 +data8 0xba467144459f9855, 0xba6a60c3c48f1a4b +data8 0xba9a76056b67ee7a, 0xbabea699563ada6e +data8 0xbae2f350b262cc4b, 0xbb1385a23be24e57 +data8 0xbb3814975e17c680, 0xbb5cc031009bf467 +data8 0xbb81889680024764, 0xbbb2c0d8703ae95d +data8 0xbbd7cd09ba3c5463, 0xbbfcf68c4977718f +data8 0xbc223d88cfc88eee, 0xbc47a2284fee4ff8 +data8 0xbc79ac0916ed7b8a, 0xbc9f5670d1a13030 +data8 0xbcc51f068cb95c1d, 0xbceb05f4b30a9bc0 +data8 0xbd110b6604c7d306, 0xbd372f8598620f19 +data8 0xbd5d727edb6b3c7e, 0xbd83d47d937bbc6d +data8 0xbdaa55addf1ae47d, 0xbdd0f63c36aa73f0 +data8 0xbdf7b6556d550a15, 0xbe1e9626b1ffa96b +data8 0xbe4595dd903e5371, 0xbe6cb5a7f14bc935 +data8 0xbe93f5b41d047cf7, 0xbebb5630bae4c15f +data8 0xbee2d74cd30a430c, 0xbf0a7937cf38d981 +data8 0xbf323c217be2bc8c, 0xbf5a203a09342bbb +data8 0xbf74cad1c14ebfc4, 0xbf9ce6a497a89f78 +data8 0xbfc52428bec6e72f, 0xbfed838fddab024b +data8 0xc016050c0420981a, 0xc03ea8cfabddc330 +data8 0xc059d3cbd65ddbce, 0xc082b122a3c78c9d +data8 0xc0abb1499ae736c4, 0xc0d4d474c3aedaaf +data8 0xc0f054ca33eb3437, 0xc119b2c67e600ed0 +data8 0xc1433453de2033ff, 0xc15ef3e44e10032d +data8 0xc188b130431d80e6, 0xc1b2929d6067730e +data8 0xc1ce9268f31cc734, 0xc1f8b0877c1b0c08 +data8 0xc222f35a87b415ba, 0xc23f3467349e5c88 +data8 0xc269b4e40e088c01, 0xc2945aac24daaf6e +data8 0xc2b0de05e43c1d66, 0xc2dbc275e1229d09 +data8 0xc2f86fca9d80eeff, 0xc323938449a2587e +data8 0xc3406b40a538ed20, 0xc36bcee8211d15e0 +data8 0xc397593adf2ba366, 0xc3b475b6206155d5 +data8 0xc3e0410243b97383, 0xc3fd890709833d37 +data8 0xc41ae295f7e7fa06, 0xc44709f7bb8a4dd2 +data8 0xc4648fb0e0bec4c1, 0xc490f9a94695ba14 +data8 
0xc4aeac0173b7d390, 0xc4db5941007aa853 +data8 0xc4f938aec206291a, 0xc52629e899dfd622 +data8 0xc54436e44043b965, 0xc562563abf9ea07f +data8 0xc58fa7d1dc42921c, 0xc5adf561b91e110a +data8 0xc5cc5591bdbd82fa, 0xc5fa08f1ff20593c +data8 0xc618980a79ce6862, 0xc6373a09e34b50fa +data8 0xc66550a6e0baaf35, 0xc6842241926342c9 +data8 0xc6a3070b7c93bb9e, 0xc6d18260bb84081b +data8 0xc6f0977c9416828b, 0xc70fc0117c641630 +data8 0xc72efc34d7e615be, 0xc75dfb441594141e +data8 0xc77d68aa019bda4c, 0xc79ce9ea478dbc4f +data8 0xc7bc7f1ae453219d, 0xc7ec0476e15e141a +data8 0xc80bcbe16f1d540f, 0xc82ba78a5d349735 +data8 0xc84b978847a06b87, 0xc86b9bf1ee817bc6 +data8 0xc88bb4de3667cdf4, 0xc8bc00e7fe9e23a3 +data8 0xc8dc4d7ff2d25232, 0xc8fcaeebcb40eb47 +data8 0xc91d25431426a663, 0xc93db09d7fdb2949 +data8 0xc95e5112e721582a, 0xc97f06bb49787677 +data8 0xc99fd1aecd6e1b06, 0xc9d12a3e27bb1625 +data8 0xc9f22ad82ba3d5f0, 0xca134113105e67b2 +data8 0xca346d07b045a876, 0xca55aecf0e94bb88 +data8 0xca77068257be9bab, 0xca98743ae1c693a8 +data8 0xcab9f8122c99a101, 0xcadb9221e268c3b5 +data8 0xcafd4283d8043dfd, 0xcb1f09520d37c6fb +data8 0xcb51ddcb9e93095e, 0xcb95f333968ad59b +data8 0xcbda64292d3ffd97, 0xcc1f3184af961596 +data8 0xcc5bb1ac954d33e2, 0xcca12e9831fc6402 +data8 0xcce70a67b64f24ad, 0xcd24794726477ea5 +data8 0xcd6b096a0b70ee87, 0xcda9177738b15a90 +data8 0xcdf05f2247dffab9, 0xce2f0f347f96f906 +data8 0xce6e0be0cd551a61, 0xceb666b2c347d1de +data8 0xcef609b0cb874f00, 0xcf35fb5447e5c765 +data8 0xcf763c47ee869f00, 0xcfb6cd3888d71785 +data8 0xcff7aed4fbfbb447, 0xd038e1ce5167e3c6 +data8 0xd07a66d7bfa0ebba, 0xd0bc3ea6b32d1b21 +data8 0xd0f4f0e8f36c1bf8, 0xd1376458e34b037e +data8 0xd17a2ca133f78572, 0xd1bd4a80301c5715 +data8 0xd1f71682b2fa4575, 0xd23ad555f773f059 +data8 0xd2752c7039a5bf73, 0xd2b98ee008c06b59 +data8 0xd2f4735ffd700280, 0xd32f99ed6d9ac0e1 +data8 0xd374f0666c75d51c, 0xd3b0a7d13618e4a1 +data8 0xd3eca2ea53bcec0c, 0xd428e23874f13a17 +data8 0xd46f82fe293bc6d3, 0xd4ac57e9b7186420 +data8 0xd4e972becb04e8b8, 
0xd526d40a7a9b43a3 +data8 0xd5647c5b73917370, 0xd5a26c4201bd6d13 +data8 0xd5e0a45015350a7e, 0xd614b539c6194104 +data8 0xd6537310e224283f, 0xd6927ab62244c917 +data8 0xd6d1ccc1fc4ef4b7, 0xd71169cea98fdded +data8 0xd746a66a5bc9f6d9, 0xd786ce8f0fae5317 +data8 0xd7bc7ff214c4e75a, 0xd7fd35467a517ed1 +data8 0xd83e38838648d815, 0xd874a1db598b8951 +data8 0xd8ab42205b80edaf, 0xd8ed1849d202f965 +data8 0xd92432bd5a173685, 0xd9669ca45b03c23e +data8 0xd99e3327cf89574e, 0xd9d602b19b100466 +data8 0xda0e0ba86c096841, 0xda5195fcdb1c3dce +data8 0xda8a1eb87a491f6c, 0xdac2e230b91c3f84 +data8 0xdafbe0d0b66aea30, 0xdb351b04a8fafced +data8 0xdb6e9139e33cdd8e, 0xdba843ded7151ea1 +data8 0xdbe2336319b61fc8, 0xdc1c60376789fa68 +data8 0xdc56cacda82d0cd5, 0xdc917398f2797814 +data8 0xdccc5b0d90a3e628, 0xdd0781a10469f0f2 +data8 0xdd42e7ca0b52838f, 0xdd729ad01c69114d +data8 0xddae749c001fbf5e, 0xddea8f50a51c69b1 +data8 0xde26eb69a0f0f111, 0xde576480262399bc +data8 0xde943789645933c8, 0xded14d58139a28af +data8 0xdf025c00bbf2b5c7, 0xdf3feb44d723a713 +data8 0xdf715bc16c159be0, 0xdfaf66240e29cda8 +data8 0xdfe139cbf6e19bdc, 0xe01fc0fe94d9fc52 +data8 0xe051f92ffcc0bd60, 0xe090feec9c9a06ac +data8 0xe0c39d0c9ff862d6, 0xe0f668eeb99f188d +data8 0xe1362890eb663139, 0xe1695c7212aecbaa +data8 0xe19cbf0391bbbbe9, 0xe1d050901c531e85 +data8 0xe2110903b4f4047a, 0xe2450559b4d80b6d +data8 0xe27931a231554ef3, 0xe2ad8e2ac3c5b04b +data8 0xe2e21b41b9694cce, 0xe316d93615862714 +data8 0xe3590bd86a0d30f9, 0xe38e38e38e38e38e +data8 0xe3c397d1e6db7839, 0xe3f928f5953feb9e +data8 0xe42eeca17c62886c, 0xe464e32943446305 +data8 0xe49b0ce15747a8a2, 0xe4d16a1eee94e9d4 +data8 0xe4fa52107353f67d, 0xe5310a471f4d2dc3 +data8 0xe567f6f1c2b9c224, 0xe59f18689a9e4c9a +data8 0xe5d66f04b8a68ecf, 0xe60dfb2005c192e9 +data8 0xe645bd1544c7ea51, 0xe66fb21b505b20a0 +data8 0xe6a7d32af4a7c59a, 0xe6e02b129c6a5ae4 +data8 0xe70a9136a7403039, 0xe74349fb2d92a589 +data8 0xe77c3a9c86ed7d42, 0xe7a713f88151518a +data8 0xe7e067453317ed2b, 0xe819f37a81871bb5 
+data8 0xe8454236bfaeca14, 0xe87f32f24c3fc90e +data8 0xe8aacd8688892ba6, 0xe8e523fd32f606f7 +data8 0xe9110b5311407927, 0xe94bc8bf0c108fa3 +data8 0xe977fdc439c2ca3c, 0xe9b3236528fc349e +data8 0xe9dfa70b745ac1b4, 0xea1b36268d0eaa38 +data8 0xea480963fd394197, 0xea84034425f27484 +data8 0xeab12713138dd1cc, 0xeade6db73a5e503b +data8 0xeb1b0268343b121b, 0xeb489b0b2bdb5f14 +data8 0xeb765721e85f03d0, 0xebb389645f222f62 +data8 0xebe198f090607e0c, 0xec0fcc9321024509 +data8 0xec3e247da8b82f61, 0xec7c27d21321c9f7 +data8 0xecaad5278824e453, 0xecd9a76d097d4e77 +data8 0xed089ed5dcd99446, 0xed37bb95add09a1c +data8 0xed76c70508f904b6, 0xeda63bb05e7f93c6 +data8 0xedd5d661daed2dc4, 0xee05974eef86b903 +data8 0xee357ead791fc670, 0xee658cb3c134a463 +data8 0xee95c1987f080211, 0xeec61d92d8c4314f +data8 0xeef6a0da64a014ac, 0xef274ba72a07c811 +data8 0xef581e31a2c91260, 0xef8918b2bc43aec6 +data8 0xefba3b63d89d7cbf, 0xefeb867ecffaa607 +data8 0xf01cfa3df1b9c9fa, 0xf04e96dc05b43e2d +data8 0xf0805c944d827454, 0xf0b24ba285c495cb +data8 0xf0e46442e76f6569, 0xf116a6b2291d7896 +data8 0xf1383fa9e9b5b381, 0xf16ac84f90083b9b +data8 0xf19d7b686dcb03d7, 0xf1d0593311db1757 +data8 0xf20361ee8f1c711e, 0xf23695da7de51d3f +data8 0xf258d095e465cc35, 0xf28c4d0bfc982b34 +data8 0xf2bff55eb3f0ea71, 0xf2f3c9cf9884636e +data8 0xf31670135ab9cc0f, 0xf34a8e9f0b54cdfb +data8 0xf37ed9fa6b8add3f, 0xf3a1cfe884ef6bb6 +data8 0xf3d66689dcc8e8d3, 0xf40b2ab069d5c96a +data8 0xf42e718b90c8bc16, 0xf463822a0a3b4b00 +data8 0xf498c1076015faf8, 0xf4bc5a19a33990b5 +data8 0xf4f1e6a7d6f5425f, 0xf527a232cf6be334 +data8 0xf54b8ecdcda90851, 0xf5819949c7ad87b4 +data8 0xf5a5bac9213b48a9, 0xf5dc1501f324a812 +data8 0xf6006bee86b5589e, 0xf63716b2fa067fa4 +data8 0xf66df22fb6132b9c, 0xf6929fb98225deb1 +data8 0xf6c9cd13021e3fea, 0xf6eeb177472cedae +data8 0xf713abf4cb0b3afb, 0xf74b4d5333684ef1 +data8 0xf7707f75a72f8e94, 0xf7a874b97927af44 +data8 0xf7cddf140aedf1d8, 0xf806291bacb7f7a9 +data8 0xf82bcc43b92eafef, 0xf8646bf0defb759e +data8 
0xf88a487dfc3ff5f7, 0xf8b03c2b46cdc17f +data8 0xf8e95541c152ae7a, 0xf90f832c2700c160 +data8 0xf935c88e0c7f419b, 0xf96f5cd84fd86873 +data8 0xf995dd53ebdd9d6d, 0xf9bc75a034436a41 +data8 0xf9f686f26d5518de, 0xfa1d5b39b910a8c5 +data8 0xfa4447acc4ecbfd2, 0xfa7ed7e51e6fdfb4 +data8 0xfaa601394d49a1a0, 0xfacd431644ce0e40 +data8 0xfaf49d96f7a75909, 0xfb2fd3c65e562fd5 +data8 0xfb576c5762024805, 0xfb7f1debc22c4040 +data8 0xfba6e89f32d0190a, 0xfbe2c803a0894893 +data8 0xfc0ad1ff0ed9ecf0, 0xfc32f57bdfbcbe7f +data8 0xfc5b32968f99b21c, 0xfc83896bc861ab08 +data8 0xfcabfa1861ed4815, 0xfce8d3cea7d3163e +data8 0xfd118595143ee273, 0xfd3a519943d4865a +data8 0xfd6337f8e1ae5a4b, 0xfd8c38d1c8e927eb +data8 0xfdb5544205095a53, 0xfdde8a67d2613531 +data8 0xfe07db619e781611, 0xfe460768d80bf758 +data8 0xfe6f9bfb06cd32f6, 0xfe994bcd3d14fcc2 +data8 0xfec316fecaf3f2ab, 0xfeecfdaf33fadb80 +data8 0xff16fffe2fa8fad6, 0xff411e0ba9db886d +data8 0xff6b57f7c33e4e9a, 0xff95ade2d1bd7358 +data8 0xffc01fed60f86fb5, 0xffeaae3832b63956 +ASM_SIZE_DIRECTIVE(T_table) + + + + + +D_table: +ASM_TYPE_DIRECTIVE(D_table,@object) +data4 0x1e50f488, 0x1ebdc559, 0x1e649ec1, 0x9eed9b2c +data4 0x9e511c44, 0x9ec6d551, 0x9eefe248, 0x9e313854 +data4 0x9f54ff18, 0x9d231411, 0x1ee5d63c, 0x9edf6b95 +data4 0x9f332aaa, 0x1dc92a84, 0x1f73fb7b, 0x1e32f100 +data4 0x9ea636f5, 0x9f6c3353, 0x9f405552, 0x1f33fd97 +data4 0x1e975291, 0x9e59a11e, 0x1e47b0ba, 0x9d8ad33e +data4 0x1ea51bf6, 0x1f25d782, 0x9ecf534d, 0x1f55436f +data4 0x1d0975e4, 0x9f0633a1, 0x1f3e840a, 0x1f523a4c +data4 0x9f53cbbc, 0x9c8b5661, 0x9f6bc8eb, 0x1f4f6c7b +data4 0x9ed9b376, 0x9f5b30b6, 0x1f64fa5e, 0x1cbcc3e0 +data4 0x1f343548, 0x1f62a6a2, 0x9f336abb, 0x9f1d15af +data4 0x1f476c83, 0x1ea86421, 0x1f33b2cf, 0x9e8f1348 +data4 0x1f6fa829, 0x9f30ee3a, 0x9ebd6146, 0x1f2db598 +data4 0x1ef9600d, 0x1f5b1427, 0x9edd741b, 0x1f51ef4e +data4 0x9f1aa57d, 0x9ee9b5e0, 0x9f17ecd7, 0x1ead71ff +data4 0x1f6c910e, 0x9e1837df, 0x9f0f17d9, 0x9e8350dd +data4 0x9d292f1b, 0x9e33b3ab, 0x9d6f0fe8, 
0x9ed8c7cc +data4 0x9ec598c8, 0x9d56758c, 0x1e090c1e, 0x9ed4b941 +data4 0x9f1fc4cf, 0x1f63513a, 0x9edd0abc, 0x1e3924dd +data4 0x1f60d56f, 0x1ea84424, 0x9e88f4fb, 0x1f205c09 +data4 0x1ec9ae4e, 0x1d2d5738, 0x9f2c9f6d, 0x1e0765c2 +data4 0x1e8bbdd7, 0x9f16d9f1, 0x9ea62627, 0x1f13904c +data4 0x1e566ab8, 0x9dca3d1a, 0x9e91f2a1, 0x9f14641c +data4 0x9f278946, 0x1f490c1e, 0x1f575eb6, 0x1f50b3fd +data4 0x9da32efb, 0x1ea95e59, 0x9e41e058, 0x9eada15f +data4 0x9e4fe66c, 0x1f3abc98, 0x1f1b8d1e, 0x9ece97e4 +data4 0x1d188aed, 0x9e89b6ee, 0x1f287478, 0x9e8a161a +data4 0x1e4749f7, 0x9e68084a, 0x1e867f33, 0x9f462b63 +data4 0x1db30792, 0x1f59a767, 0x9d1da4ae, 0x9f472a33 +data4 0x1d1e91cd, 0x9f414824, 0x9f473d4f, 0x1f4b5783 +data4 0x9f5b04b8, 0x9f5c205b, 0x1f309617, 0x9f0d6852 +data4 0x9d96a609, 0x9f0965c2, 0x9e23f467, 0x9f089884 +data4 0x9ec71458, 0x9ed6e955, 0x1e5e8691, 0x1f5b2bbc +data4 0x9f128268, 0x1ed40f5b, 0x1dc430ce, 0x1f345986 +data4 0x1d778f72, 0x1e9b11d6, 0x9f5a40be, 0x9e07f61a +data4 0x9ed641a7, 0x9f334787, 0x1e952fd0, 0x1edeb5e2 +data4 0x9e9f3eb1, 0x9e379fd9, 0x1f13102a, 0x9e5e80e1 +data4 0x1c757944, 0x1dae2260, 0x1f183ab7, 0x1e55d576 +data4 0x9e6bb99f, 0x9f52d7cb, 0x9e73a0f5, 0x1d4e1d14 +data4 0x9dd05b53, 0x1f2261e4, 0x9d4ee73d, 0x1ede515e +data4 0x1f22a573, 0x9ecac348, 0x1e6a2ac0, 0x1e2787d2 +data4 0x9eb64b87, 0x1f0c69c6, 0x9f470a01, 0x9d7c1686 +data4 0x1e468ebe, 0x9f21ee2f, 0x9ee52116, 0x9e20f715 +data4 0x1ed18533, 0x9f005b38, 0x9f20cb95, 0x1da72967 +data4 0x1f1ba5d7, 0x1e2f8b16, 0x9c794f96, 0x9ca74ea3 +data4 0x1f410555, 0x9eff2b96, 0x1ce8f0b1, 0x1f0cee77 +data4 0x1f191edd, 0x9ed5fcbc, 0x1f30f242, 0x9e0ad369 +data4 0x1ed8f3c8, 0x1f52bb0e, 0x9e9ce408, 0x1f18907f +data4 0x9ecdad40, 0x9e8af91d, 0x1d46698a, 0x9f4b93d6 +data4 0x9f3f5d33, 0x1e2e52f7, 0x9f13aeec, 0x9f3b1969 +data4 0x1f0996f4, 0x9f2a03df, 0x1e264767, 0x1f3ab1fb +data4 0x9f3193c9, 0x9f21ce22, 0x9eab624c, 0x9ecd8fb1 +data4 0x1eaf9a85, 0x1f0c6a2c, 0x1eecbe61, 0x1f3fead9 +data4 0x1f1d3a29, 0x1e9099ce, 0x1eadd875, 
0x1e4dbfb8 +data4 0x9dc640d2, 0x1f413680, 0x9f3f57b3, 0x1dfa1553 +data4 0x1ec71c6b, 0x1e00cc00, 0x9f271e55, 0x1e5a88bb +data4 0x1f46cc2b, 0x1ee80ff9, 0x9e29c6f3, 0x1f15e229 +data4 0x9ea83d66, 0x1f37408e, 0x9dacb66e, 0x1e6f6259 +data4 0x9f106973, 0x1dd4e5ac, 0x1cbfdcc8, 0x9f231c9f +data4 0x9e8677e4, 0x9e9e695a, 0x1efd782b, 0x9dd26959 +data4 0x9e80af69, 0x1f386fb3, 0x1f022e8c, 0x9e839967 +data4 0x1ce6796f, 0x1e4c22c2, 0x1e57ef24, 0x1e919804 +data4 0x9d7ea090, 0x1e40140a, 0x1f261b46, 0x1db75be2 +data4 0x1f145019, 0x9e3102b9, 0x9e22507b, 0x1eae813c +data4 0x1f117e97, 0x1f282296, 0x1f3814b3, 0x1e17977b +data4 0x1f39d6ff, 0x9f1c81b9, 0x9eb5bcad, 0x1f0f596e +data4 0x1e757fd5, 0x9f090daa, 0x9f2532fc, 0x9eebafbb +data4 0x1f086556, 0x9eeedde8, 0x9f32e174, 0x1e33c030 +data4 0x1f1f145a, 0x1e6e556c, 0x1e419ffb, 0x9eb6019a +data4 0x9e872a2e, 0x1e113136, 0x1e93096f, 0x1f39be40 +data4 0x1f1665ad, 0x9db81d7d, 0x9cd29091, 0x1e3f4af7 +data4 0x9f23176c, 0x9eccf9b3, 0x1f34fc6c, 0x9ed36894 +data4 0x1ef08e06, 0x9f3b46bb, 0x9f2c850b, 0x1f1565a4 +data4 0x1e887bc3, 0x1e92629c, 0x9f11ac9e, 0x9e5579f3 +data4 0x1e4d5790, 0x9ee1c3d1, 0x9e916aec, 0x9eb8d9b8 +data4 0x1db46105, 0x1e168663, 0x1f26a942, 0x9f0f0383 +data4 0x9f079032, 0x9ecae1d8, 0x1ed3b34c, 0x9edc5ee6 +data4 0x9e8a75a7, 0x1f3c3de2, 0x9ee5041e, 0x1f08c727 +data4 0x1d02d7ae, 0x9f36adda, 0x9ef9a857, 0x9ef5cb3a +data4 0x9eee73da, 0x9da5d629, 0x1e0e99be, 0x1e5159b9 +data4 0x1f2eac89, 0x9e8eedc5, 0x1dd0ec90, 0x1f229aff +data4 0x1ed9c3e6, 0x1e95c55a, 0x9f0c24e4, 0x1e8afed6 +data4 0x1e599a96, 0x1e881b21, 0x1eab84b9, 0x9ba2bb0e +data4 0x9e33ab10, 0x1f1710b5, 0x1ebfa271, 0x9e90bbc5 +data4 0x9f32515b, 0x9b32aae8, 0x1eda455c, 0x1da8186e +data4 0x9e8917ff, 0x1ec4d08e, 0x1c90069d, 0x9f2f1d29 +data4 0x9ecee86d, 0x9f234d1f, 0x1f370724, 0x1da87496 +data4 0x1e7959f0, 0x9e8ada34, 0x1f1c7f6f, 0x1edd576b +data4 0x9de91e8b, 0x1ec4ef89, 0x1f32078a, 0x1e9925e2 +data4 0x9d8eeccb, 0x9ea3d011, 0x1f231fdf, 0x9f1dbdfa +data4 0x1e7507a3, 0x1ec42614, 0x9e8693cb, 
0x9ec68398 +data4 0x1d5b05fb, 0x1de32119, 0x9f003429, 0x9ec16d92 +data4 0x9f095315, 0x9f119d2c, 0x9ed0c984, 0x9f090662 +data4 0x9e59aa1f, 0x9ed4e64a, 0x9f2798a7, 0x9f23624d +data4 0x1e0467d9, 0x1f22e7e7, 0x1e915256, 0x9cb4df70 +data4 0x9e6f687c, 0x9e3c35e5, 0x9e5757ab, 0x9f031fa1 +data4 0x1f25bff7, 0x1f0e58c2, 0x1ef3ce04, 0x1f002ecb +data4 0x9ebdc836, 0x9ed657dd, 0x9f149441, 0x9e8544b2 +data4 0x1cd8ff1e, 0x1e9bb463, 0x1eaa1c5c, 0x1f200c1a +data4 0x1edbfbaf, 0x1f18724d, 0x9ed63c22, 0x9f08e045 +data4 0x1f13ad07, 0x9e949311, 0x9f0c50d4, 0x1e824516 +data4 0x1d5e52ba, 0x1d583fbd, 0x1e3b60a9, 0x9effe6d3 +data4 0x1f0d0508, 0x1f00be77, 0x9e404bfa, 0x9e1ca381 +data4 0x9f084dd8, 0x9e6db85d, 0x1db698e4, 0x9ebd1871 +data4 0x9ecc2679, 0x1ee68442, 0x1edb1050, 0x9dbc96a4 +data4 0x9f27c1f4, 0x1c99b756, 0x1eb4400a, 0x9f24390a +data4 0x1d927875, 0x9f074faa, 0x1e9dc2c3, 0x1f13c0d2 +data4 0x1e3c9685, 0x9e6b6f75, 0x9db9cb31, 0x1ea5f3aa +data4 0x9d992c61, 0x1f1015e4, 0x1f194f70, 0x9e19d2b3 +data4 0x9d89116c, 0x1f23cd35, 0x1e33d3a2, 0x1ee331b8 +data4 0x1d5ba7ec, 0x9f273788, 0x9e6907f4, 0x9ed5f912 +data4 0x9edd458d, 0x1e2ca7b2, 0x1ef81fe4, 0x1dc7ade6 +data4 0x1e876e51, 0x9f04ec89, 0x1f1da63a, 0x1ec02bd0 +data4 0x9e71326f, 0x1e7847b4, 0x1f0de618, 0x9e036cb6 +data4 0x1eec61e2, 0x1ef1758b, 0x9ee880a3, 0x1ed269d7 +data4 0x1e27edd3, 0x9e8a81a1, 0x1eacb84d, 0x9e1aad37 +data4 0x1f1aa8f7, 0x1e9bbd90, 0x1ea1b61f, 0x9ed41c2f +data4 0x1dbb5dd6, 0x1f0ec733, 0x9df06b1b, 0x1e06fef1 +data4 0x9edede3a, 0x1edeb5e2, 0x1f0e63ee, 0x9db316bb +data4 0x9efc1ad3, 0x1f01fbb5, 0x9cc0d078, 0x1ea28b36 +data4 0x9e9dd205, 0x9e791534, 0x1da1c8d5, 0x9e8195cc +data4 0x1f0681a4, 0x1eeaf1e2, 0x9ef83b37, 0x9f22a92b +data4 0x1eabc4ce, 0x1f10eefb, 0x1e06d9aa, 0x1e7cacd5 +data4 0x1f1ea087, 0x1eb21983, 0x9f100c78, 0x1e840abe +data4 0x9efab66c, 0x1f183fa8, 0x9e84ee68, 0x9eea083d +data4 0x9ee23a74, 0x1f1351d7, 0x9ec5d42a, 0x9f071f57 +data4 0x9ef578d9, 0x9f1aa7e7, 0x1eb02044, 0x1f151a2e +data4 0x9c0dc8b2, 0x9ef4087a, 0x1ec12b93, 
0x1c1a946b +data4 0x1e89946f, 0x9dafe8c3, 0x1d295288, 0x9e8497ab +data4 0x1ec000c6, 0x1e102f29, 0x1e542256, 0x1e67d44d +data4 0x1ef688d8, 0x1f0e0f29, 0x1e67861f, 0x1e869748 +data4 0x1ee6aa6e, 0x9e4d228b, 0x9e50be5b, 0x1e9fe225 +data4 0x9ea34102, 0x9e628a3b, 0x9ed9fd83, 0x1ecd7109 +data4 0x1f1864ff, 0x1ea19b76, 0x1db0d1c9, 0x9dff519b +data4 0x1e8fea71, 0x9ee82e9a, 0x9f08919b, 0x9ef5c8ae +data4 0x9ee446a4, 0x1ea59444, 0x1eb74230, 0x1ea13fbf +data4 0x9ea6a3ea, 0x1e5f2797, 0x9e0adb07, 0x9d3adadd +data4 0x1ebf2ee2, 0x1da19bfa, 0x1e8dea6d, 0x1ec4fea9 +data4 0x1e669f22, 0x1dc5f919, 0x9ed25caa, 0x1ee475b1 +data4 0x1ed0603e, 0x9eacb35c, 0x1dc00b27, 0x1e2f9991 +data4 0x1e7b0406, 0x1eaa3387, 0x9d865bde, 0x1eb78a48 +data4 0x1c40ae2e, 0x1ee9838b, 0x9f0f0d7f, 0x1e3e5d26 +data4 0x1e99e7a6, 0x9e681ccf, 0x9e93ed65, 0x9eeb6a66 +data4 0x1e29e9af, 0x9e96f923, 0x9e74f11d, 0x9f1474da +data4 0x1eec2ea7, 0x1ebf7aa3, 0x9c25dcca, 0x9f0553c2 +data4 0x9e599efd, 0x1d2ab490, 0x1e95d7cd, 0x9ee4b20e +data4 0x9d988ce5, 0x9ef9787e, 0x9dbbba5b, 0x9f12c304 +data4 0x1e3b9d70, 0x1e7bcae8, 0x9d98bb6e, 0x9e8e6b01 +data4 0x9f07d03b, 0x9d67c822, 0x9f0ef69e, 0x1c7c0fe3 +data4 0x9e9bfbb9, 0x9e83b84b, 0x1efbf15e, 0x9ecfa6a6 +data4 0x9c91158e, 0x9ecf6770, 0x1ee1e3a8, 0x9dc95ec0 +data4 0x1ef603f7, 0x1d5e52ba, 0x1c477d1b, 0x9e955cd8 +data4 0x1ed665b0, 0x9e8376c4, 0x9c0ee88e, 0x1e8c989e +data4 0x1ea2df29, 0x9d961e5c, 0x1e101813, 0x1e7fffff +data4 0x9e5abff4, 0x1dbddd71, 0x1eb69100, 0x1e71f114 +data4 0x1e9ca798, 0x1ef62c8d, 0x9db4e55a, 0x1dbe69ce +data4 0x9ef1c01f, 0x1f044a2a, 0x9eb9e0d7, 0x9ee59745 +data4 0x9e874803, 0x1ea0b418, 0x9e13572a, 0x1ddbb3a2 +data4 0x9ec0e391, 0x1e89fba1, 0x1ee8b261, 0x9e5d25f0 +data4 0x9ef222cb, 0x9ef135ec, 0x1ea04b9a, 0x9f04291f +data4 0x9e969254, 0x9ee32f08, 0x9ed909d3, 0x9e362640 +data4 0x9ec20735, 0x1e50131b, 0x9ed4e049, 0x1ee8e817 +data4 0x1e1e09c0, 0x9ea643c5, 0x9e5a1ab6, 0x9e389059 +data4 0x1e560947, 0x1d02b877, 0x1e4475ab, 0x9ea9aaf6 +data4 0x1e95bc5e, 0x1eaf6afd, 0x1d43067d, 
0x9d043821 +data4 0x9e97baa9, 0x1de5c4f9, 0x9e9a0069, 0x9e1b9944 +data4 0x1eb13686, 0x9eb907eb, 0x1e059589, 0x1cbd0f93 +data4 0x9eb7e6ae, 0x1e9fa175, 0x1ee5bdf4, 0x1e8052f7 +data4 0x9c80d1e3, 0x1bfbe28e, 0x9e672b3b, 0x9ecacf19 +data4 0x9e3c04be, 0x1dfe8c5c, 0x1e1ba9cb, 0x1eb40b1e +data4 0x1ec7e7f6, 0x9d0d45b3, 0x1ef0113b, 0x9a155fa3 +data4 0x1e28ec3b, 0x1e7ca8df, 0x9d2f91b4, 0x1eccd9ed +data4 0x9ed943bc, 0x9ccaab19, 0x9e8a5c58, 0x1ec3bca8 +data4 0x1ed78dc7, 0x9ed391a8, 0x9e938f6e, 0x9ec4a030 +data4 0x9e80346e, 0x1e7a4686, 0x9e284315, 0x9e39584c +data4 0x1ebdc9b4, 0x9e9cfce5, 0x9ef55c65, 0x1e2941e7 +data4 0x9efbe59f, 0x1d87c41b, 0x1e40befc, 0x1e3d05b5 +data4 0x1de9ea67, 0x1ec9a21c, 0x1decb69a, 0x1df6e75a +data4 0x9e8030ab, 0x9db20540, 0x9ef1e977, 0x1e3cdc43 +data4 0x1e0492b0, 0x9e91d872, 0x1e775346, 0x9e939978 +data4 0x1eb2714e, 0x1e49a203, 0x9e10195a, 0x1ef1ffc3 +data4 0x9ea8b709, 0x9e832e27, 0x1ed5ac3b, 0x1edb20a6 +data4 0x1e4dbd4e, 0x1efbb932, 0x1d8170ec, 0x1e6c4849 +data4 0x1f008e17, 0x1e8000c4, 0x1d855ecf, 0x9e37cb85 +data4 0x1ecffdf5, 0x1eba6519, 0x9edbe600, 0x1ea3e5e7 +data4 0x1ed4fb39, 0x1f00be77, 0x1e6f4484, 0x9e9e7107 +data4 0x9e30b29d, 0x9ee6e174, 0x1e3a2656, 0x9dd72f3f +data4 0x9ee12138, 0x1ed16fed, 0x9ece8a02, 0x9ca5b249 +data4 0x9eafd508, 0x9ef0e9fc, 0x1d1307ac, 0x1eecee20 +data4 0x1cf60c6f, 0x9d556216, 0x9eaed175, 0x9ec919f4 +data4 0x1ec2c988, 0x1cd82772, 0x9dc99456, 0x1eab0467 +data4 0x1e89b36f, 0x1c757944, 0x1eef9abd, 0x9e98664d +ASM_SIZE_DIRECTIVE(D_table) + + + + + +.align 32 +.global cbrtl# + +.section .text +.proc cbrtl# +.align 32 +cbrtl: + + +{ .mfi + getf.sig r3=f8 + // the argument a is in f8 (r3 = its significand); will continue only for normal/denormal numbers: p7 = a is normal/denormal, p12 = a is anything else +(p0) fclass.nm.unc p12,p7 = f8, 0x1b + // r2 = linkage-table (@ltoff) address that holds the pointer to C_1...C_6 followed by T_table + addl r2 = @ltoff(poly_coeffs), gp;; +} +{.mfi + // r29=2/3*bias -63=0xaaaa-0x3f=0xaa6b (bias=0xffff; added to floor(biased exponent/3) further down) + mov r29=0xaa6b + // normalize a (f14 = a*1+0) + fma.s1 f14=f8,f1,f0 + // r27 = linkage-table (@ltoff) address that holds the pointer to D table + addl r27 = @ltoff(D_table), gp;; +} +{.mib + nop.m 0 + (p7) cmp.eq
p12,p0=r3,r0 + nop.b 0;; +} +{.mfb + // load start address for C_1...C_6 followed by T_table + ld8 r2=[r2] + (p12) fma.s0 f8=f8,f1,f0 // p12 path (a not normal/denormal, or significand r3 == 0): result = a*1+0 + (p12) br.ret.spnt b0;; +} +{.mmf + // load C_1 + ldfe f7=[r2],16 + // load start address of D table + ld8 r27=[r27] + // y=frcpa(a) + frcpa.s0 f8,p6=f1,f8;; +} +{.mmi + // load C_2 + ldfe f9=[r2],16;; + // load C_3, C_4 + ldfpd f10,f11=[r2],16 + nop.i 0;; +} +{.mmi + // get normalized significand + getf.sig r23=f14 + // get sign and exponent field of normalized a + getf.exp r24=f14 + mov r25=0x20000;; +} +{.mii + // get r26=sign (mask 0x20000) + and r26=r24,r25 + // eliminate leading 1 from r23=2nd table index + shl r23=r23,1 + // eliminate sign from exponent (r25) + andcm r25=r24,r25;; +} +{.mfi + // load C_5,C_6 + (p6) ldfpd f12,f13=[r2],16 + // r=1-a*y + (p6) fnma.s1 f6=f8,f14,f1 + // r24=5*expon; first step of multiplying by (2^{16}-1)/3=0x5555 + shladd r24=r25,2,r25;; +} +{.mib + // r30=(5*expon)*16 + shladd r30=r24,4,r0 + // r28=3*expon + shladd r28=r25,1,r25 + nop.b 0;; +} +{.mmi + // r28=6*expon + shladd r28=r28,1,r0 + // r24=17*(5*expon)=85*expon + add r24=r24,r30 + // r23=2nd table index (8 bits) + shr.u r23=r23,56;; +} +{.mmi + // adjust T_table pointer by 2nd index (8-byte entries) + shladd r2=r23,3,r2 + // adjust D_table pointer by 2nd index (4-byte entries) + shladd r27=r23,2,r27 + // r30=(85*expon)*16^2 + shl r30=r24,8;; +} +{.mmi + // r24=257*(85*expon)=expon*(2^16-1)/3 + add r24=r24,r30;; + // r24=16*r24+6*expon=expon*(2^20+2)/3=expon*0x55556 + shladd r24=r24,4,r28 + nop.i 0;; +} +{.mii + nop.m 0 + // r24=floor(expon/3) + shr.u r24=r24,20 + nop.i 0;; +} +{.mmi + nop.m 0 + // r28=3*floor(expon/3) + shladd r28=r24,1,r24 + // bias exponent (add r29=0xaa6b) + add r24=r29,r24;; +} +{.mmi + // get remainder of exponent/3 + sub r25=r25,r28;; + // add sign to exponent + or r24=r24,r26 + // remainder <<=8 (each remainder selects a group of 256 table entries) + shl r25=r25,8;; +} +{.mfi + // adjust D_table pointer by 1st index + shladd r27=r25,2,r27 + // P_1=C_1+C_2*r + (p6) fma.s1 f7=f9,f6,f7 + // adjust T_table pointer by 1st index + shladd r2=r25,3,r2 +} +{.mfi + // f14=sign*2^{floor(exponent/3)-63} + (p6) setf.exp f14=r24 + // r^2=r*r (kept in f9; not the integer register r2) + (p6)
fma.s1 f9=f6,f6,f0 + nop.i 0;; +} +{.mfi + // load D (single precision) + (p6) ldfs f15=[r27] + // P_2=C_3+C_4*r + (p6) fma.s1 f10=f11,f6,f10 + nop.i 0 +} +{.mfi + // load T (ldf8: the table entry becomes a 64-bit integer significand) + (p6) ldf8 f8=[r2] + // P_3=C_5+C_6*r + (p6) fma.s1 f12=f13,f6,f12 + nop.i 0;; +} +{.mfi + nop.m 0 + // P_4=D-r*P_1 + (p6) fnma.s1 f15=f6,f7,f15 + nop.i 0 +} +{.mfi + nop.m 0 + // r^3=r*r^2 (kept in f6; not the integer register r3) + (p6) fma.s1 f6=f6,f9,f0 + nop.i 0;; +} +{.mfi + nop.m 0 + // P_5=P_2+r^2*P_3 + (p6) fma.s1 f10=f9,f12,f10 + nop.i 0;; +} +{.mfi + nop.m 0 + // T=T*f14, with f14=sign*2^{floor(exponent/3)-63} + (p6) fma.s1 f8=f8,f14,f0 + nop.i 0 +} +{.mfi + nop.m 0 + // P=P_4-r^3*P_5 + (p6) fnma.s1 f6=f6,f10,f15 + nop.i 0;; +} +{.mfb + nop.m 0 + // result=T+T*P (returned in f8) + (p6) fma.s0 f8=f8,f6,f8 + br.ret.sptk b0;; +} +.endp cbrtl +ASM_SIZE_DIRECTIVE(cbrtl) diff --git a/sysdeps/ia64/fpu/s_ceil.S b/sysdeps/ia64/fpu/s_ceil.S new file mode 100644 index 0000000000..58057c8fde --- /dev/null +++ b/sysdeps/ia64/fpu/s_ceil.S @@ -0,0 +1,249 @@ +.file "ceil.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// + +#include "libm_support.h" + +.align 32 +.global ceil# + +.section .text +.proc ceil# +.align 32 + +// History +//============================================================== +// 2/02/00: Initial version +// 6/13/00: Improved speed +// 6/27/00: Eliminated incorrect invalid flag setting + +// API +//============================================================== +// double ceil(double x) + +// general input registers (aliases for scratch general registers): + +ceil_GR_FFFF = r14 +ceil_GR_signexp = r15 +ceil_GR_exponent = r16 +ceil_GR_expmask = r17 +ceil_GR_bigexp = r18 + + +// predicate registers used: + +// p6 ==> Input is NaN, infinity, zero +// p7 ==> Input is denormal +// p8 ==> Input is <0 +// p9 ==> Input is >=0 +// p10 ==> Input is already an integer (exponent >= ceil_GR_bigexp, no fraction bits) +// p11 ==> Input is not a large integer (general trunc(x) + adjustment path) +// p12 ==> Input is an integer below the large-exponent threshold (trunc(x) == x) +// p13 ==> Input is not an integer, so inexact must be set +// p14 ==> Input is between -1 and 0, so result will be -0 and inexact + + +// floating-point registers used: + +CEIL_SIGNED_ZERO = f7 +CEIL_NORM_f8 = f9 +CEIL_FFFF = f10 +CEIL_INEXACT = f11 +CEIL_FLOAT_INT_f8 = f12 +CEIL_INT_f8 = f13 +CEIL_adj = f14 +CEIL_MINUS_ONE = f15 + +// Overview of operation +//============================================================== + 
+// double ceil(double x) +// Return an integer value (represented as a double) that is the smallest +// value not less than x +// This is x rounded toward +infinity to an integral value. +// Inexact is set if x != ceil(x) +// ************************************************************************** + +// Set denormal flag for denormal input and +// take denormal fault if necessary. + +// Is the input an integer value already? + +// double_extended +// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double (the threshold this routine loads into ceil_GR_bigexp) +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is >= 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. 
+ +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +ceil: + +{ .mfi + getf.exp ceil_GR_signexp = f8 + fcvt.fx.trunc.s1 CEIL_INT_f8 = f8 + addl ceil_GR_bigexp = 0x10033, r0 +} +{ .mfi + addl ceil_GR_FFFF = -1,r0 + fcmp.lt.s1 p8,p9 = f8,f0 + mov ceil_GR_expmask = 0x1FFFF ;; +} + +// p7 ==> denorm; CEIL_FFFF gets an all-ones significand (from -1) used later to raise inexact +{ .mfi + setf.sig CEIL_FFFF = ceil_GR_FFFF + fclass.m p7,p0 = f8, 0x0b + nop.i 999 +} +{ .mfi + nop.m 999 + fnorm CEIL_NORM_f8 = f8 + nop.i 999 ;; +} + +// Form 0 with sign of input in case negative zero is needed +{ .mfi + nop.m 999 + fmerge.s CEIL_SIGNED_ZERO = f8, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fsub.s1 CEIL_MINUS_ONE = f0, f1 + nop.i 999 ;; +} + +// p6 ==> NAN, INF, ZERO; p10 ==> everything else (refined by the exponent test below) +{ .mfb + nop.m 999 + fclass.m p6,p10 = f8, 0xe7 +(p7) br.cond.spnt L(CEIL_DENORM) ;; +} + +L(CEIL_COMMON): +.pred.rel "mutex",p8,p9 +// Set adjustment to add to trunc(x) for result +// If x>=0 (p9), adjustment is 1.0 +// If x<0 (p8), adjustment is 0.0 +{ .mfi + and ceil_GR_exponent = ceil_GR_signexp, ceil_GR_expmask +(p9) fadd.s1 CEIL_adj = f1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fadd.s1 CEIL_adj = f0,f0 + nop.i 999 ;; +} + +{ .mfi +(p10) cmp.ge.unc p10,p11 = ceil_GR_exponent, ceil_GR_bigexp +(p6) fnorm.d f8 = f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fcvt.xf CEIL_FLOAT_INT_f8 = CEIL_INT_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fnorm.d f8 = CEIL_NORM_f8 + nop.i 999 ;; +} + +// Is -1 < x < 0? If so, result will be -0 and inexact. p14 is set, and p11 is cleared (r0 != r0 is never true) so the general path is skipped. 
+{ .mfi + nop.m 999 +(p8) fcmp.gt.unc.s1 p14,p0 = CEIL_NORM_f8, CEIL_MINUS_ONE + nop.i 999 ;; +} + +{ .mfi +(p14) cmp.ne p11,p0 = r0,r0 +(p14) fnorm.d f8 = CEIL_SIGNED_ZERO + nop.i 999 +} +{ .mfi + nop.m 999 +(p14) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fadd.d f8 = CEIL_FLOAT_INT_f8,CEIL_adj + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fcmp.eq.unc.s1 p12,p13 = CEIL_FLOAT_INT_f8, CEIL_NORM_f8 + nop.i 999 ;; +} + +// Set inexact if result not equal to input (squaring the all-ones significand cannot be exact; the product in CEIL_INEXACT is not read in this routine) +{ .mfi + nop.m 999 +(p13) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF + nop.i 999 +} +// Set result to input if integer (overwrites the trunc(x) + adjustment sum written above) +{ .mfb + nop.m 999 +(p12) fnorm.d f8 = CEIL_NORM_f8 + br.ret.sptk b0 ;; +} + +// Here if input denorm: redo getf.exp and the truncation on the normalized copy, then rejoin CEIL_COMMON +L(CEIL_DENORM): +{ .mfb + getf.exp ceil_GR_signexp = CEIL_NORM_f8 + fcvt.fx.trunc.s1 CEIL_INT_f8 = CEIL_NORM_f8 + br.cond.sptk L(CEIL_COMMON) ;; +} + +.endp ceil +ASM_SIZE_DIRECTIVE(ceil) diff --git a/sysdeps/ia64/fpu/s_ceilf.S b/sysdeps/ia64/fpu/s_ceilf.S new file mode 100644 index 0000000000..2636e85deb --- /dev/null +++ b/sysdeps/ia64/fpu/s_ceilf.S @@ -0,0 +1,249 @@ +.file "ceilf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// + +#include "libm_support.h" + +.align 32 +.global ceilf# + +.section .text +.proc ceilf# +.align 32 + +// History +//============================================================== +// 2/02/00: Initial version +// 6/13/00: Improved speed +// 6/27/00: Eliminated incorrect invalid flag setting + +// API +//============================================================== +// float ceilf(float x) + +// general input registers (aliases for scratch general registers): + +ceil_GR_FFFF = r14 +ceil_GR_signexp = r15 +ceil_GR_exponent = r16 +ceil_GR_expmask = r17 +ceil_GR_bigexp = r18 + + +// predicate registers used: + +// p6 ==> Input is NaN, infinity, zero +// p7 ==> Input is denormal +// p8 ==> Input is <0 +// p9 ==> Input is >=0 +// p10 ==> Input is already an integer (exponent >= ceil_GR_bigexp, no fraction bits) +// p11 ==> Input is not a large integer (general trunc(x) + adjustment path) +// p12 ==> Input is an integer below the large-exponent threshold (trunc(x) == x) +// p13 ==> Input is not an integer, so inexact must be set +// p14 ==> Input is between -1 and 0, so result will be -0 and inexact + + +// floating-point registers used: + +CEIL_SIGNED_ZERO = f7 +CEIL_NORM_f8 = f9 +CEIL_FFFF = f10 +CEIL_INEXACT = f11 +CEIL_FLOAT_INT_f8 = f12 +CEIL_INT_f8 = f13 +CEIL_adj = f14 +CEIL_MINUS_ONE = f15 + +// Overview of operation +//============================================================== + 
+// float ceilf(float x) +// Return an integer value (represented as a float) that is the smallest +// value not less than x +// This is x rounded toward +infinity to an integral value. +// Inexact is set if x != ceilf(x) +// ************************************************************************** + +// Set denormal flag for denormal input and +// take denormal fault if necessary. + +// Is the input an integer value already? + +// double_extended +// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single (the threshold this routine loads into ceil_GR_bigexp) +// if the exponent is >= 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. 
+ +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +ceilf: + +{ .mfi + getf.exp ceil_GR_signexp = f8 + fcvt.fx.trunc.s1 CEIL_INT_f8 = f8 + addl ceil_GR_bigexp = 0x10016, r0 +} +{ .mfi + addl ceil_GR_FFFF = -1,r0 + fcmp.lt.s1 p8,p9 = f8,f0 + mov ceil_GR_expmask = 0x1FFFF ;; +} + +// p7 ==> denorm; CEIL_FFFF gets an all-ones significand (from -1) used later to raise inexact +{ .mfi + setf.sig CEIL_FFFF = ceil_GR_FFFF + fclass.m p7,p0 = f8, 0x0b + nop.i 999 +} +{ .mfi + nop.m 999 + fnorm CEIL_NORM_f8 = f8 + nop.i 999 ;; +} + +// Form 0 with sign of input in case negative zero is needed +{ .mfi + nop.m 999 + fmerge.s CEIL_SIGNED_ZERO = f8, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fsub.s1 CEIL_MINUS_ONE = f0, f1 + nop.i 999 ;; +} + +// p6 ==> NAN, INF, ZERO; p10 ==> everything else (refined by the exponent test below) +{ .mfb + nop.m 999 + fclass.m p6,p10 = f8, 0xe7 +(p7) br.cond.spnt L(CEIL_DENORM) ;; +} + +L(CEIL_COMMON): +.pred.rel "mutex",p8,p9 +// Set adjustment to add to trunc(x) for result +// If x>=0 (p9), adjustment is 1.0 +// If x<0 (p8), adjustment is 0.0 +{ .mfi + and ceil_GR_exponent = ceil_GR_signexp, ceil_GR_expmask +(p9) fadd.s1 CEIL_adj = f1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fadd.s1 CEIL_adj = f0,f0 + nop.i 999 ;; +} + +{ .mfi +(p10) cmp.ge.unc p10,p11 = ceil_GR_exponent, ceil_GR_bigexp +(p6) fnorm.s f8 = f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fcvt.xf CEIL_FLOAT_INT_f8 = CEIL_INT_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fnorm.s f8 = CEIL_NORM_f8 + nop.i 999 ;; +} + +// Is -1 < x < 0? If so, result will be -0 and inexact. p14 is set, and p11 is cleared (r0 != r0 is never true) so the general path is skipped. 
+{ .mfi + nop.m 999 +(p8) fcmp.gt.unc.s1 p14,p0 = CEIL_NORM_f8, CEIL_MINUS_ONE + nop.i 999 ;; +} + +{ .mfi +(p14) cmp.ne p11,p0 = r0,r0 +(p14) fnorm.s f8 = CEIL_SIGNED_ZERO + nop.i 999 +} +{ .mfi + nop.m 999 +(p14) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fadd.s f8 = CEIL_FLOAT_INT_f8,CEIL_adj + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fcmp.eq.unc.s1 p12,p13 = CEIL_FLOAT_INT_f8, CEIL_NORM_f8 + nop.i 999 ;; +} + +// Set inexact if result not equal to input (squaring the all-ones significand cannot be exact; the product in CEIL_INEXACT is not read in this routine) +{ .mfi + nop.m 999 +(p13) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF + nop.i 999 +} +// Set result to input if integer (overwrites the trunc(x) + adjustment sum written above) +{ .mfb + nop.m 999 +(p12) fnorm.s f8 = CEIL_NORM_f8 + br.ret.sptk b0 ;; +} + +// Here if input denorm: redo getf.exp and the truncation on the normalized copy, then rejoin CEIL_COMMON +L(CEIL_DENORM): +{ .mfb + getf.exp ceil_GR_signexp = CEIL_NORM_f8 + fcvt.fx.trunc.s1 CEIL_INT_f8 = CEIL_NORM_f8 + br.cond.sptk L(CEIL_COMMON) ;; +} + +.endp ceilf +ASM_SIZE_DIRECTIVE(ceilf) diff --git a/sysdeps/ia64/fpu/s_ceill.S b/sysdeps/ia64/fpu/s_ceill.S new file mode 100644 index 0000000000..443ae92a3c --- /dev/null +++ b/sysdeps/ia64/fpu/s_ceill.S @@ -0,0 +1,249 @@ +.file "ceill.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// + +#include "libm_support.h" + +.align 32 +.global ceill# + +.section .text +.proc ceill# +.align 32 + +// History +//============================================================== +// 2/02/00: Initial version +// 6/13/00: Improved speed +// 6/27/00: Eliminated incorrect invalid flag setting + +// API +//============================================================== +// long double ceill(long double x) + +// general input registers (aliases for scratch general registers): + +ceil_GR_FFFF = r14 +ceil_GR_signexp = r15 +ceil_GR_exponent = r16 +ceil_GR_expmask = r17 +ceil_GR_bigexp = r18 + + +// predicate registers used: + +// p6 ==> Input is NaN, infinity, zero +// p7 ==> Input is denormal +// p8 ==> Input is <0 +// p9 ==> Input is >=0 +// p10 ==> Input is already an integer (exponent >= ceil_GR_bigexp, no fraction bits) +// p11 ==> Input is not a large integer (general trunc(x) + adjustment path) +// p12 ==> Input is an integer below the large-exponent threshold (trunc(x) == x) +// p13 ==> Input is not an integer, so inexact must be set +// p14 ==> Input is between -1 and 0, so result will be -0 and inexact + + +// floating-point registers used: + +CEIL_SIGNED_ZERO = f7 +CEIL_NORM_f8 = f9 +CEIL_FFFF = f10 +CEIL_INEXACT = f11 +CEIL_FLOAT_INT_f8 = f12 +CEIL_INT_f8 = f13 +CEIL_adj = f14 +CEIL_MINUS_ONE = f15 + +// Overview of operation +//============================================================== 
+ +// long double ceill(long double x) +// Return an integer value (represented as a long double) that is the smallest +// value not less than x +// This is x rounded toward +infinity to an integral value. +// Inexact is set if x != ceill(x) +// ************************************************************************** + +// Set denormal flag for denormal input and +// take denormal fault if necessary. + +// Is the input an integer value already? + +// double_extended (the threshold this routine loads into ceil_GR_bigexp) +// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is >= 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. 
+ +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +ceill: + +{ .mfi + getf.exp ceil_GR_signexp = f8 + fcvt.fx.trunc.s1 CEIL_INT_f8 = f8 + addl ceil_GR_bigexp = 0x1003e, r0 +} +{ .mfi + addl ceil_GR_FFFF = -1,r0 + fcmp.lt.s1 p8,p9 = f8,f0 + mov ceil_GR_expmask = 0x1FFFF ;; +} + +// p7 ==> denorm; CEIL_FFFF gets an all-ones significand (from -1) used later to raise inexact +{ .mfi + setf.sig CEIL_FFFF = ceil_GR_FFFF + fclass.m p7,p0 = f8, 0x0b + nop.i 999 +} +{ .mfi + nop.m 999 + fnorm CEIL_NORM_f8 = f8 + nop.i 999 ;; +} + +// Form 0 with sign of input in case negative zero is needed +{ .mfi + nop.m 999 + fmerge.s CEIL_SIGNED_ZERO = f8, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fsub.s1 CEIL_MINUS_ONE = f0, f1 + nop.i 999 ;; +} + +// p6 ==> NAN, INF, ZERO; p10 ==> everything else (refined by the exponent test below) +{ .mfb + nop.m 999 + fclass.m p6,p10 = f8, 0xe7 +(p7) br.cond.spnt L(CEIL_DENORM) ;; +} + +L(CEIL_COMMON): +.pred.rel "mutex",p8,p9 +// Set adjustment to add to trunc(x) for result +// If x>=0 (p9), adjustment is 1.0 +// If x<0 (p8), adjustment is 0.0 +{ .mfi + and ceil_GR_exponent = ceil_GR_signexp, ceil_GR_expmask +(p9) fadd.s1 CEIL_adj = f1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fadd.s1 CEIL_adj = f0,f0 + nop.i 999 ;; +} + +{ .mfi +(p10) cmp.ge.unc p10,p11 = ceil_GR_exponent, ceil_GR_bigexp +(p6) fnorm f8 = f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fcvt.xf CEIL_FLOAT_INT_f8 = CEIL_INT_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fnorm f8 = CEIL_NORM_f8 + nop.i 999 ;; +} + +// Is -1 < x < 0? If so, result will be -0 and inexact. p14 is set, and p11 is cleared (r0 != r0 is never true) so the general path is skipped. 
+{ .mfi + nop.m 999 +(p8) fcmp.gt.unc.s1 p14,p0 = CEIL_NORM_f8, CEIL_MINUS_ONE + nop.i 999 ;; +} + +{ .mfi +(p14) cmp.ne p11,p0 = r0,r0 +(p14) fnorm f8 = CEIL_SIGNED_ZERO + nop.i 999 +} +{ .mfi + nop.m 999 +(p14) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fadd f8 = CEIL_FLOAT_INT_f8,CEIL_adj + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fcmp.eq.unc.s1 p12,p13 = CEIL_FLOAT_INT_f8, CEIL_NORM_f8 + nop.i 999 ;; +} + +// Set inexact if result not equal to input (squaring the all-ones significand cannot be exact; the product in CEIL_INEXACT is not read in this routine) +{ .mfi + nop.m 999 +(p13) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF + nop.i 999 +} +// Set result to input if integer (overwrites the trunc(x) + adjustment sum written above) +{ .mfb + nop.m 999 +(p12) fnorm f8 = CEIL_NORM_f8 + br.ret.sptk b0 ;; +} + +// Here if input denorm: redo getf.exp and the truncation on the normalized copy, then rejoin CEIL_COMMON +L(CEIL_DENORM): +{ .mfb + getf.exp ceil_GR_signexp = CEIL_NORM_f8 + fcvt.fx.trunc.s1 CEIL_INT_f8 = CEIL_NORM_f8 + br.cond.sptk L(CEIL_COMMON) ;; +} + +.endp ceill +ASM_SIZE_DIRECTIVE(ceill) diff --git a/sysdeps/ia64/fpu/s_cos.S b/sysdeps/ia64/fpu/s_cos.S new file mode 100644 index 0000000000..cd715b4d22 --- /dev/null +++ b/sysdeps/ia64/fpu/s_cos.S @@ -0,0 +1,3488 @@ +.file "sincos.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial revision +// 4/02/00 Unwind support added. +// 6/16/00 Updated tables to enforce symmetry +// 8/31/00 Saved 2 cycles in main path, and 9 in other paths. +// 9/20/00 The updated tables regressed to an old version, so reinstated them +// 10/18/00 Changed one table entry to ensure symmetry +// 1/03/01 Improved speed, fixed flag settings for small arguments. + +// API +//============================================================== +// double sin( double x); +// double cos( double x); +// +// Overview of operation +//============================================================== +// +// Step 1 +// ====== +// Reduce x to region -1/2*pi/2^k ===== 0 ===== +1/2*pi/2^k where k=4 +// divide x by pi/2^k. +// Multiply by 2^k/pi. +// nfloat = Round result to integer (round-to-nearest) +// +// r = x - nfloat * pi/2^k +// Do this as (x - nfloat * HIGH(pi/2^k)) - nfloat * LOW(pi/2^k) for increased accuracy. +// pi/2^k is stored as two numbers that when added make pi/2^k. +// pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k) +// +// x = (nfloat * pi/2^k) + r +// r is small enough that we can use a polynomial approximation +// and is referred to as the reduced argument. 
+// +// Step 3 +// ====== +// Take the unreduced part and remove the multiples of 2pi. +// So nfloat = nfloat (with lower k+1 bits cleared) + lower k+1 bits +// +// nfloat (with lower k+1 bits cleared) is a multiple of 2^(k+1) +// N * 2^(k+1) +// nfloat * pi/2^k = N * 2^(k+1) * pi/2^k + (lower k+1 bits) * pi/2^k +// nfloat * pi/2^k = N * 2 * pi + (lower k+1 bits) * pi/2^k +// nfloat * pi/2^k = N2pi + M * pi/2^k +// +// +// Sin(x) = Sin((nfloat * pi/2^k) + r) +// = Sin(nfloat * pi/2^k) * Cos(r) + Cos(nfloat * pi/2^k) * Sin(r) +// +// Sin(nfloat * pi/2^k) = Sin(N2pi + Mpi/2^k) +// = Sin(N2pi)Cos(Mpi/2^k) + Cos(N2pi)Sin(Mpi/2^k) +// = Sin(Mpi/2^k) +// +// Cos(nfloat * pi/2^k) = Cos(N2pi + Mpi/2^k) +// = Cos(N2pi)Cos(Mpi/2^k) - Sin(N2pi)Sin(Mpi/2^k) +// = Cos(Mpi/2^k) +// +// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r) +// +// +// Step 4 +// ====== +// 0 <= M < 2^(k+1) +// There are 2^(k+1) Sin entries in a table. +// There are 2^(k+1) Cos entries in a table. +// +// Get Sin(Mpi/2^k) and Cos(Mpi/2^k) by table lookup. +// +// +// Step 5 +// ====== +// Calculate Cos(r) and Sin(r) by polynomial approximation. +// +// Cos(r) = 1 + r^2 q1 + r^4 q2 + r^6 q3 + ... = Series for Cos +// Sin(r) = r + r^3 p1 + r^5 p2 + r^7 p3 + ... = Series for Sin +// +// and the coefficients q1, q2, ... and p1, p2, ... are stored in a table +// +// +// Calculate +// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r) +// +// as follows +// +// Sm = Sin(Mpi/2^k) and Cm = Cos(Mpi/2^k) +// rsq = r*r +// +// +// P = p1 + r^2p2 + r^4p3 + r^6p4 +// Q = q1 + r^2q2 + r^4q3 + r^6q4 +// +// rcub = r * rsq +// Sin(r) = r + rcub * P +// = r + r^3p1 + r^5p2 + r^7p3 + r^9p4 + ... = Sin(r) +// +// The coefficients are not exactly these values, but almost. +// +// p1 = -1/6 = -1/3! +// p2 = 1/120 = 1/5! +// p3 = -1/5040 = -1/7! +// p4 = 1/362880 = 1/9! 
+// +// P = r + rcub * P +// +// Answer = Sm Cos(r) + Cm P +// +// Cos(r) = 1 + rsq Q +// Cos(r) = 1 + r^2 Q +// Cos(r) = 1 + r^2 (q1 + r^2q2 + r^4q3 + r^6q4) +// Cos(r) = 1 + r^2q1 + r^4q2 + r^6q3 + r^8q4 + ... +// +// Sm Cos(r) = Sm(1 + rsq Q) +// Sm Cos(r) = Sm + Sm rsq Q +// Sm Cos(r) = Sm + s_rsq Q +// Q = Sm + s_rsq Q +// +// Then, +// +// Answer = Q + Cm P + +#include "libm_support.h" + +// Registers used +//============================================================== +// general input registers: +// r14 -> r19 +// r32 -> r45 + +// predicate registers used: +// p6 -> p14 + +// floating-point registers used +// f9 -> f15 +// f32 -> f61 + +// Assembly macros +//============================================================== +sind_NORM_f8 = f9 +sind_W = f10 +sind_int_Nfloat = f11 +sind_Nfloat = f12 + +sind_r = f13 +sind_rsq = f14 +sind_rcub = f15 + +sind_Inv_Pi_by_16 = f32 +sind_Pi_by_16_hi = f33 +sind_Pi_by_16_lo = f34 + +sind_Inv_Pi_by_64 = f35 +sind_Pi_by_64_hi = f36 +sind_Pi_by_64_lo = f37 + +sind_Sm = f38 +sind_Cm = f39 + +sind_P1 = f40 +sind_Q1 = f41 +sind_P2 = f42 +sind_Q2 = f43 +sind_P3 = f44 +sind_Q3 = f45 +sind_P4 = f46 +sind_Q4 = f47 + +sind_P_temp1 = f48 +sind_P_temp2 = f49 + +sind_Q_temp1 = f50 +sind_Q_temp2 = f51 + +sind_P = f52 +sind_Q = f53 + +sind_srsq = f54 + +sind_SIG_INV_PI_BY_16_2TO61 = f55 +sind_RSHF_2TO61 = f56 +sind_RSHF = f57 +sind_2TOM61 = f58 +sind_NFLOAT = f59 +sind_W_2TO61_RSH = f60 + +fp_tmp = f61 + +///////////////////////////////////////////////////////////// + +sind_AD_1 = r33 +sind_AD_2 = r34 +sind_exp_limit = r35 +sind_r_signexp = r36 +sind_AD_beta_table = r37 +sind_r_sincos = r38 + +sind_r_exp = r39 +sind_r_17_ones = r40 + +sind_GR_sig_inv_pi_by_16 = r14 +sind_GR_rshf_2to61 = r15 +sind_GR_rshf = r16 +sind_GR_exp_2tom61 = r17 +sind_GR_n = r18 +sind_GR_m = r19 +sind_GR_32m = r19 + +gr_tmp = r41 +GR_SAVE_PFS = r41 +GR_SAVE_B0 = r42 +GR_SAVE_GP = r43 + + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 +double_sind_pi: 
+ASM_TYPE_DIRECTIVE(double_sind_pi,@object) +// data8 0xA2F9836E4E44152A, 0x00004001 // 16/pi (significand loaded w/ setf) +// c90fdaa22168c234 + data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 hi +// c4c6628b80dc1cd1 29024e088a + data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 lo +ASM_SIZE_DIRECTIVE(double_sind_pi) + +double_sind_pq_k4: +ASM_TYPE_DIRECTIVE(double_sind_pq_k4,@object) + data8 0x3EC71C963717C63A // P4 + data8 0x3EF9FFBA8F191AE6 // Q4 + data8 0xBF2A01A00F4E11A8 // P3 + data8 0xBF56C16C05AC77BF // Q3 + data8 0x3F8111111110F167 // P2 + data8 0x3FA555555554DD45 // Q2 + data8 0xBFC5555555555555 // P1 + data8 0xBFDFFFFFFFFFFFFC // Q1 +ASM_SIZE_DIRECTIVE(double_sind_pq_k4) + + +double_sin_cos_beta_k4: +ASM_TYPE_DIRECTIVE(double_sin_cos_beta_k4,@object) +data8 0x0000000000000000 , 0x00000000 // sin( 0 pi/16) S0 +data8 0x8000000000000000 , 0x00003fff // cos( 0 pi/16) C0 + +data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin( 1 pi/16) S1 +data8 0xfb14be7fbae58157 , 0x00003ffe // cos( 1 pi/16) C1 + +data8 0xc3ef1535754b168e , 0x00003ffd // sin( 2 pi/16) S2 +data8 0xec835e79946a3146 , 0x00003ffe // cos( 2 pi/16) C2 + +data8 0x8e39d9cd73464364 , 0x00003ffe // sin( 3 pi/16) S3 +data8 0xd4db3148750d181a , 0x00003ffe // cos( 3 pi/16) C3 + +data8 0xb504f333f9de6484 , 0x00003ffe // sin( 4 pi/16) S4 +data8 0xb504f333f9de6484 , 0x00003ffe // cos( 4 pi/16) C4 + + +data8 0xd4db3148750d181a , 0x00003ffe // sin( 5 pi/16) C3 +data8 0x8e39d9cd73464364 , 0x00003ffe // cos( 5 pi/16) S3 + +data8 0xec835e79946a3146 , 0x00003ffe // sin( 6 pi/16) C2 +data8 0xc3ef1535754b168e , 0x00003ffd // cos( 6 pi/16) S2 + +data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 7 pi/16) C1 +data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos( 7 pi/16) S1 + +data8 0x8000000000000000 , 0x00003fff // sin( 8 pi/16) C0 +data8 0x0000000000000000 , 0x00000000 // cos( 8 pi/16) S0 + + +data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 9 pi/16) C1 +data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos( 9 pi/16) -S1 + +data8 0xec835e79946a3146 , 
0x00003ffe // sin(10 pi/16) C2 +data8 0xc3ef1535754b168e , 0x0000bffd // cos(10 pi/16) -S2 + +data8 0xd4db3148750d181a , 0x00003ffe // sin(11 pi/16) C3 +data8 0x8e39d9cd73464364 , 0x0000bffe // cos(11 pi/16) -S3 + +data8 0xb504f333f9de6484 , 0x00003ffe // sin(12 pi/16) S4 +data8 0xb504f333f9de6484 , 0x0000bffe // cos(12 pi/16) -S4 + + +data8 0x8e39d9cd73464364 , 0x00003ffe // sin(13 pi/16) S3 +data8 0xd4db3148750d181a , 0x0000bffe // cos(13 pi/16) -C3 + +data8 0xc3ef1535754b168e , 0x00003ffd // sin(14 pi/16) S2 +data8 0xec835e79946a3146 , 0x0000bffe // cos(14 pi/16) -C2 + +data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin(15 pi/16) S1 +data8 0xfb14be7fbae58157 , 0x0000bffe // cos(15 pi/16) -C1 + +data8 0x0000000000000000 , 0x00000000 // sin(16 pi/16) S0 +data8 0x8000000000000000 , 0x0000bfff // cos(16 pi/16) -C0 + + +data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(17 pi/16) -S1 +data8 0xfb14be7fbae58157 , 0x0000bffe // cos(17 pi/16) -C1 + +data8 0xc3ef1535754b168e , 0x0000bffd // sin(18 pi/16) -S2 +data8 0xec835e79946a3146 , 0x0000bffe // cos(18 pi/16) -C2 + +data8 0x8e39d9cd73464364 , 0x0000bffe // sin(19 pi/16) -S3 +data8 0xd4db3148750d181a , 0x0000bffe // cos(19 pi/16) -C3 + +data8 0xb504f333f9de6484 , 0x0000bffe // sin(20 pi/16) -S4 +data8 0xb504f333f9de6484 , 0x0000bffe // cos(20 pi/16) -S4 + + +data8 0xd4db3148750d181a , 0x0000bffe // sin(21 pi/16) -C3 +data8 0x8e39d9cd73464364 , 0x0000bffe // cos(21 pi/16) -S3 + +data8 0xec835e79946a3146 , 0x0000bffe // sin(22 pi/16) -C2 +data8 0xc3ef1535754b168e , 0x0000bffd // cos(22 pi/16) -S2 + +data8 0xfb14be7fbae58157 , 0x0000bffe // sin(23 pi/16) -C1 +data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos(23 pi/16) -S1 + +data8 0x8000000000000000 , 0x0000bfff // sin(24 pi/16) -C0 +data8 0x0000000000000000 , 0x00000000 // cos(24 pi/16) S0 + + +data8 0xfb14be7fbae58157 , 0x0000bffe // sin(25 pi/16) -C1 +data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos(25 pi/16) S1 + +data8 0xec835e79946a3146 , 0x0000bffe // sin(26 pi/16) -C2 +data8 
0xc3ef1535754b168e , 0x00003ffd // cos(26 pi/16) S2 + +data8 0xd4db3148750d181a , 0x0000bffe // sin(27 pi/16) -C3 +data8 0x8e39d9cd73464364 , 0x00003ffe // cos(27 pi/16) S3 + +data8 0xb504f333f9de6484 , 0x0000bffe // sin(28 pi/16) -S4 +data8 0xb504f333f9de6484 , 0x00003ffe // cos(28 pi/16) S4 + + +data8 0x8e39d9cd73464364 , 0x0000bffe // sin(29 pi/16) -S3 +data8 0xd4db3148750d181a , 0x00003ffe // cos(29 pi/16) C3 + +data8 0xc3ef1535754b168e , 0x0000bffd // sin(30 pi/16) -S2 +data8 0xec835e79946a3146 , 0x00003ffe // cos(30 pi/16) C2 + +data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(31 pi/16) -S1 +data8 0xfb14be7fbae58157 , 0x00003ffe // cos(31 pi/16) C1 + +data8 0x0000000000000000 , 0x00000000 // sin(32 pi/16) S0 +data8 0x8000000000000000 , 0x00003fff // cos(32 pi/16) C0 +ASM_SIZE_DIRECTIVE(double_sin_cos_beta_k4) + +.align 32 +.global sin# +.global cos# +#ifdef _LIBC +.global __sin# +.global __cos# +#endif + +//////////////////////////////////////////////////////// +// There are two entry points: sin and cos + + +// If from sin, p8 is true +// If from cos, p9 is true + +.section .text +.proc sin# +#ifdef _LIBC +.proc __sin# +#endif +.align 32 + +sin: +#ifdef _LIBC +__sin: +#endif + +{ .mlx + alloc r32=ar.pfs,1,13,0,0 + movl sind_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // significand of 16/pi +} +{ .mlx + addl sind_AD_1 = @ltoff(double_sind_pi), gp + movl sind_GR_rshf_2to61 = 0x47b8000000000000 // 1.1000 2^(63+63-2) +} +;; + +{ .mfi + ld8 sind_AD_1 = [sind_AD_1] + fnorm sind_NORM_f8 = f8 + cmp.eq p8,p9 = r0, r0 +} +{ .mib + mov sind_GR_exp_2tom61 = 0xffff-61 // exponent of scaling factor 2^-61 + mov sind_r_sincos = 0x0 + br.cond.sptk L(SIND_SINCOS) +} +;; + +.endp sin +ASM_SIZE_DIRECTIVE(sin) + + +.section .text +.proc cos# +#ifdef _LIBC +.proc __cos# +#endif +.align 32 +cos: +#ifdef _LIBC +__cos: +#endif + +{ .mlx + alloc r32=ar.pfs,1,13,0,0 + movl sind_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // significand of 16/pi +} +{ .mlx + addl sind_AD_1 = 
@ltoff(double_sind_pi), gp + movl sind_GR_rshf_2to61 = 0x47b8000000000000 // 1.1000 2^(63+63-2) +} +;; + +{ .mfi + ld8 sind_AD_1 = [sind_AD_1] + fnorm.s1 sind_NORM_f8 = f8 + cmp.eq p9,p8 = r0, r0 +} +{ .mib + mov sind_GR_exp_2tom61 = 0xffff-61 // exponent of scaling factor 2^-61 + mov sind_r_sincos = 0x8 + br.cond.sptk L(SIND_SINCOS) +} +;; + + +//////////////////////////////////////////////////////// +// All entry points end up here. +// If from sin, sind_r_sincos is 0 and p8 is true +// If from cos, sind_r_sincos is 8 = 2^(k-1) and p9 is true +// We add sind_r_sincos to N + +L(SIND_SINCOS): + + +// Form two constants we need +// 16/pi * 2^-2 * 2^63, scaled by 2^61 since we just loaded the significand +// 1.1000...000 * 2^(63+63-2) to right shift int(W) into the low significand +// fcmp used to set denormal, and invalid on snans +{ .mfi + setf.sig sind_SIG_INV_PI_BY_16_2TO61 = sind_GR_sig_inv_pi_by_16 + fcmp.eq.s0 p12,p0=f8,f0 + mov sind_r_17_ones = 0x1ffff +} +{ .mlx + setf.d sind_RSHF_2TO61 = sind_GR_rshf_2to61 + movl sind_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift +} +;; + +// Form another constant +// 2^-61 for scaling Nfloat +// 0x10009 is register_bias + 10. 
+// So if f8 > 2^10 = Gamma, go to DBX +{ .mfi + setf.exp sind_2TOM61 = sind_GR_exp_2tom61 + fclass.m p13,p0 = f8, 0x23 // Test for x inf + mov sind_exp_limit = 0x10009 +} +;; + +// Load the two pieces of pi/16 +// Form another constant +// 1.1000...000 * 2^63, the right shift constant +{ .mmf + ldfe sind_Pi_by_16_hi = [sind_AD_1],16 + setf.d sind_RSHF = sind_GR_rshf + fclass.m p14,p0 = f8, 0xc3 // Test for x nan +} +;; + +{ .mfi + ldfe sind_Pi_by_16_lo = [sind_AD_1],16 +(p13) frcpa.s0 f8,p12=f0,f0 // force qnan indef for x=inf + addl gr_tmp = -1,r0 +} +{ .mfb + addl sind_AD_beta_table = @ltoff(double_sin_cos_beta_k4), gp + nop.f 999 +(p13) br.ret.spnt b0 ;; // Exit for x=inf +} + +// Start loading P, Q coefficients +// SIN(0) +{ .mfi + ldfpd sind_P4,sind_Q4 = [sind_AD_1],16 +(p8) fclass.m.unc p6,p0 = f8, 0x07 // Test for sin(0) + nop.i 999 +} +{ .mfb + addl sind_AD_beta_table = @ltoff(double_sin_cos_beta_k4), gp +(p14) fma.d f8=f8,f1,f0 // qnan for x=nan +(p14) br.ret.spnt b0 ;; // Exit for x=nan +} + + +// COS(0) +{ .mfi + getf.exp sind_r_signexp = f8 +(p9) fclass.m.unc p7,p0 = f8, 0x07 // Test for cos(0) + nop.i 999 +} +{ .mfi + ld8 sind_AD_beta_table = [sind_AD_beta_table] + nop.f 999 + nop.i 999 ;; +} + +{ .mmb + ldfpd sind_P3,sind_Q3 = [sind_AD_1],16 + setf.sig fp_tmp = gr_tmp // Create constant such that fmpy sets inexact +(p6) br.ret.spnt b0 ;; // Exit for sin(+/-0): f8 is returned unchanged +} + +{ .mfb + and sind_r_exp = sind_r_17_ones, sind_r_signexp +(p7) fmerge.s f8 = f1,f1 // cos(+/-0) = 1 +(p7) br.ret.spnt b0 ;; // Exit for cos(+/-0) +} + +// p10 is true if we must call routines to handle larger arguments +// p10 is true if f8 exp is >= 0x10009 + +{ .mfi + ldfpd sind_P2,sind_Q2 = [sind_AD_1],16 + nop.f 999 + cmp.ge p10,p0 = sind_r_exp,sind_exp_limit +} +;; + +// sind_W = x * sind_Inv_Pi_by_16 +// Multiply x by scaled 16/pi and add large const to shift integer part of W to +// rightmost bits of significand +{ .mfi + ldfpd sind_P1,sind_Q1 = [sind_AD_1] + fma.s1 sind_W_2TO61_RSH = sind_NORM_f8,sind_SIG_INV_PI_BY_16_2TO61,sind_RSHF_2TO61 +
nop.i 999 +} +{ .mbb +(p10) cmp.ne.unc p11,p12=sind_r_sincos,r0 // p11 call __libm_cos_double_dbx + // p12 call __libm_sin_double_dbx +(p11) br.cond.spnt L(COSD_DBX) +(p12) br.cond.spnt L(SIND_DBX) +} +;; + + +// sind_NFLOAT = Round_Int_Nearest(sind_W) +// This is done by scaling back by 2^-61 and subtracting the shift constant +{ .mfi + nop.m 999 + fms.s1 sind_NFLOAT = sind_W_2TO61_RSH,sind_2TOM61,sind_RSHF + nop.i 999 ;; +} + + +// get N = (int)sind_int_Nfloat +{ .mfi + getf.sig sind_GR_n = sind_W_2TO61_RSH + nop.f 999 + nop.i 999 ;; +} + +// Add 2^(k-1) (which is in sind_r_sincos) to N +// sind_r = -sind_Nfloat * sind_Pi_by_16_hi + x +// sind_r = sind_r -sind_Nfloat * sind_Pi_by_16_lo +{ .mfi + add sind_GR_n = sind_GR_n, sind_r_sincos + fnma.s1 sind_r = sind_NFLOAT, sind_Pi_by_16_hi, sind_NORM_f8 + nop.i 999 ;; +} + + +// Get M (least k+1 bits of N) +{ .mmi + and sind_GR_m = 0x1f,sind_GR_n ;; + nop.m 999 + shl sind_GR_32m = sind_GR_m,5 ;; +} + +// Add 32*M to address of sin_cos_beta table +{ .mmi + add sind_AD_2 = sind_GR_32m, sind_AD_beta_table + nop.m 999 + nop.i 999 ;; +} + +{ .mfi + ldfe sind_Sm = [sind_AD_2],16 +(p8) fclass.m.unc p10,p0=f8,0x0b // If sin, note denormal input to set uflow + nop.i 999 ;; +} + +{ .mfi + ldfe sind_Cm = [sind_AD_2] + fnma.s1 sind_r = sind_NFLOAT, sind_Pi_by_16_lo, sind_r + nop.i 999 ;; +} + +// get rsq +{ .mfi + nop.m 999 + fma.s1 sind_rsq = sind_r, sind_r, f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fmpy.s0 fp_tmp = fp_tmp,fp_tmp // fmpy forces inexact flag + nop.i 999 ;; +} + +// form P and Q series +{ .mfi + nop.m 999 + fma.s1 sind_P_temp1 = sind_rsq, sind_P4, sind_P3 + nop.i 999 +} + +{ .mfi + nop.m 999 + fma.s1 sind_Q_temp1 = sind_rsq, sind_Q4, sind_Q3 + nop.i 999 ;; +} + +// get rcube and sm*rsq +{ .mfi + nop.m 999 + fmpy.s1 sind_srsq = sind_Sm,sind_rsq + nop.i 999 +} + +{ .mfi + nop.m 999 + fmpy.s1 sind_rcub = sind_r, sind_rsq + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 sind_Q_temp2 = sind_rsq, sind_Q_temp1, sind_Q2 + 
nop.i 999 +} + +{ .mfi + nop.m 999 + fma.s1 sind_P_temp2 = sind_rsq, sind_P_temp1, sind_P2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fma.s1 sind_Q = sind_rsq, sind_Q_temp2, sind_Q1 + nop.i 999 +} + +{ .mfi + nop.m 999 + fma.s1 sind_P = sind_rsq, sind_P_temp2, sind_P1 + nop.i 999 ;; +} + +// Get final P and Q +{ .mfi + nop.m 999 + fma.s1 sind_Q = sind_srsq,sind_Q, sind_Sm + nop.i 999 +} + +{ .mfi + nop.m 999 + fma.s1 sind_P = sind_rcub,sind_P, sind_r + nop.i 999 ;; +} + +// If sin(denormal), force inexact to be set +{ .mfi + nop.m 999 +(p10) fmpy.d.s0 fp_tmp = f8,f8 + nop.i 999 ;; +} + +// Final calculation +{ .mfb + nop.m 999 + fma.d f8 = sind_Cm, sind_P, sind_Q + br.ret.sptk b0 ;; +} +.endp cos# +ASM_SIZE_DIRECTIVE(cos#) + + + +// Call-out stub entered through L(SIND_DBX): saves ar.pfs, gp and b0 in +// GR_SAVE_PFS, GR_SAVE_GP and GR_SAVE_B0, calls __libm_sin_double_dbx, +// then restores the three registers and returns. +.proc __libm_callout_1s +__libm_callout_1s: +L(SIND_DBX): +.prologue +{ .mfi + nop.m 0 + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +;; + +{ .mfi + mov GR_SAVE_GP=gp + nop.f 0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +} + +.body +{ .mib + nop.m 999 + nop.i 999 + br.call.sptk.many b0=__libm_sin_double_dbx# ;; +} +;; + + +{ .mfi + mov gp = GR_SAVE_GP + nop.f 999 + mov b0 = GR_SAVE_B0 +} +;; + +{ .mib + nop.m 999 + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 ;; +} +.endp __libm_callout_1s +ASM_SIZE_DIRECTIVE(__libm_callout_1s) + + +// Call-out stub entered through L(COSD_DBX): same save/call/restore +// sequence as __libm_callout_1s, but calls __libm_cos_double_dbx. +.proc __libm_callout_1c +__libm_callout_1c: +L(COSD_DBX): +.prologue +{ .mfi + nop.m 0 + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +;; + +{ .mfi + mov GR_SAVE_GP=gp + nop.f 0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +} + +.body +{ .mib + nop.m 999 + nop.i 999 + br.call.sptk.many b0=__libm_cos_double_dbx# ;; +} +;; + + +{ .mfi + mov gp = GR_SAVE_GP + nop.f 999 + mov b0 = GR_SAVE_B0 +} +;; + +{ .mib + nop.m 999 + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 ;; +} +.endp __libm_callout_1c +ASM_SIZE_DIRECTIVE(__libm_callout_1c) + + +// ==================================================================== +// ==================================================================== + +//
These functions calculate the sin and cos for inputs +// greater than 2^10 +// __libm_sin_double_dbx# and __libm_cos_double_dbx# + +// ********************************************************************* +// ********************************************************************* +// +// Function: Combined sin(x) and cos(x), where +// +// sin(x) = sine(x), for double precision x values +// cos(x) = cosine(x), for double precision x values +// +// ********************************************************************* +// +// Accuracy: Within .7 ulps for 80-bit floating point values +// Very accurate for double precision values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f32-f99 +// +// General Purpose Registers: +// r32-r43 +// r44-r45 (Used to pass arguments to pi_by_2 reduce routine) +// +// Predicate Registers: p6-p13 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions do not occur +// Underflow exceptions raised when appropriate for sin +// (No specialized error handling for this routine) +// Inexact raised when appropriate by algorithm +// +// sin(SNaN) = QNaN +// sin(QNaN) = QNaN +// sin(inf) = QNaN +// sin(+/-0) = +/-0 +// cos(inf) = QNaN +// cos(SNaN) = QNaN +// cos(QNaN) = QNaN +// cos(0) = 1 +// +// ********************************************************************* +// +// Mathematical Description +// ======================== +// +// The computation of FSIN and FCOS is best handled in one piece of +// code. The main reason is that given any argument Arg, computation +// of trigonometric functions first calculate N and an approximation +// to alpha where +// +// Arg = N pi/2 + alpha, |alpha| <= pi/4. 
+// +// Since +// +// cos( Arg ) = sin( (N+1) pi/2 + alpha ), +// +// therefore, the code for computing sine will produce cosine as long +// as 1 is added to N immediately after the argument reduction +// process. +// +// Let M = N if sine +// N+1 if cosine. +// +// Now, given +// +// Arg = M pi/2 + alpha, |alpha| <= pi/4, +// +// let I = M mod 4, or I be the two lsb of M when M is represented +// as 2's complement. I = [i_0 i_1]. Then +// +// sin( Arg ) = (-1)^i_0 sin( alpha ) if i_1 = 0, +// = (-1)^i_0 cos( alpha ) if i_1 = 1. +// +// For example: +// if M = -1, I = 11 +// sin (-pi/2 + alpha) = (-1) cos (alpha) +// if M = 0, I = 00 +// sin (alpha) = sin (alpha) +// if M = 1, I = 01 +// sin (pi/2 + alpha) = cos (alpha) +// if M = 2, I = 10 +// sin (pi + alpha) = (-1) sin (alpha) +// if M = 3, I = 11 +// sin ((3/2)pi + alpha) = (-1) cos (alpha) +// +// The value of alpha is obtained by argument reduction and +// represented by two working precision numbers r and c where +// +// alpha = r + c accurately. +// +// The reduction method is described in a previous write up. +// The argument reduction scheme identifies 4 cases. For Cases 2 +// and 4, because |alpha| is small, sin(r+c) and cos(r+c) can be +// computed very easily by 2 or 3 terms of the Taylor series +// expansion as follows: +// +// Case 2: +// ------- +// +// sin(r + c) = r + c - r^3/6 accurately +// cos(r + c) = 1 - 2^(-67) accurately +// +// Case 4: +// ------- +// +// sin(r + c) = r + c - r^3/6 + r^5/120 accurately +// cos(r + c) = 1 - r^2/2 + r^4/24 accurately +// +// The only cases left are Cases 1 and 3 of the argument reduction +// procedure. These two cases will be merged since after the +// argument is reduced in either case, we have the reduced argument +// represented as r + c and that the magnitude |r + c| is not small +// enough to allow the usage of a very short approximation.
+// +// The required calculation is either +// +// sin(r + c) = sin(r) + correction, or +// cos(r + c) = cos(r) + correction. +// +// Specifically, +// +// sin(r + c) = sin(r) + c sin'(r) + O(c^2) +// = sin(r) + c cos (r) + O(c^2) +// = sin(r) + c(1 - r^2/2) accurately. +// Similarly, +// +// cos(r + c) = cos(r) - c sin(r) + O(c^2) +// = cos(r) - c(r - r^3/6) accurately. +// +// We therefore concentrate on accurately calculating sin(r) and +// cos(r) for a working-precision number r, |r| <= pi/4 to within +// 0.1% or so. +// +// The greatest challenge of this task is that the second terms of +// the Taylor series +// +// r - r^3/3! + r^5/5! - ... +// +// and +// +// 1 - r^2/2! + r^4/4! - ... +// +// are not very small when |r| is close to pi/4 and the rounding +// errors will be a concern if simple polynomial accumulation is +// used. When |r| < 2^-3, however, the second terms will be small +// enough (6 bits or so of right shift) that a normal Horner +// recurrence suffices. Hence there are two cases that we consider +// in the accurate computation of sin(r) and cos(r), |r| <= pi/4. +// +// Case small_r: |r| < 2^(-3) +// -------------------------- +// +// Since Arg = M pi/2 + r + c accurately, and M mod 4 is [i_0 i_1], +// we have +// +// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0 +// = (-1)^i_0 * cos(r + c) if i_1 = 1 +// +// can be accurately approximated by +// +// sin(Arg) = (-1)^i_0 * [sin(r) + c] if i_1 = 0 +// = (-1)^i_0 * [cos(r) - c*r] if i_1 = 1 +// +// because |r| is small and thus the second terms in the correction +// are unnecessary. +// +// Finally, sin(r) and cos(r) are approximated by polynomials of +// moderate lengths. +// +// sin(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11 +// cos(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10 +// +// We can make use of predicates to selectively calculate +// sin(r) or cos(r) based on i_1.
+// +// Case normal_r: 2^(-3) <= |r| <= pi/4 +// ------------------------------------ +// +// This case is more likely than the previous one if one considers +// r to be uniformly distributed in [-pi/4 pi/4]. Again, +// +// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0 +// = (-1)^i_0 * cos(r + c) if i_1 = 1. +// +// Because |r| is now larger, we need one extra term in the +// correction. sin(Arg) can be accurately approximated by +// +// sin(Arg) = (-1)^i_0 * [sin(r) + c(1-r^2/2)] if i_1 = 0 +// = (-1)^i_0 * [cos(r) - c*r*(1 - r^2/6)] i_1 = 1. +// +// Finally, sin(r) and cos(r) are approximated by polynomials of +// moderate lengths. +// +// sin(r) = r + PP_1_hi r^3 + PP_1_lo r^3 + +// PP_2 r^5 + ... + PP_8 r^17 +// +// cos(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16 +// +// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2. +// The crux in accurate computation is to calculate +// +// r + PP_1_hi r^3 or 1 + QQ_1 r^2 +// +// accurately as two pieces: U_hi and U_lo. The way to achieve this +// is to obtain r_hi as a 10 sig. bit number that approximates r to +// roughly 8 bits or so of accuracy. (One convenient way is +// +// r_hi := frcpa( frcpa( r ) ).) +// +// This way, +// +// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 + +// PP_1_hi (r^3 - r_hi^3) +// = [r + PP_1_hi r_hi^3] + +// [PP_1_hi (r - r_hi) +// (r^2 + r_hi r + r_hi^2) ] +// = U_hi + U_lo +// +// Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long, +// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed +// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign +// and that there is no more than 8 bit shift off between r and +// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus +// calculated without any error. Finally, the fact that +// +// |U_lo| <= 2^(-8) |U_hi| +// +// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly +// 8 extra bits of accuracy. 
+// +// Similarly, +// +// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] + +// [QQ_1 (r - r_hi)(r + r_hi)] +// = U_hi + U_lo. +// +// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ). +// +// If i_1 = 0, then +// +// U_hi := r + PP_1_hi * r_hi^3 +// U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2) +// poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17 +// correction := c * ( 1 + C_1 r^2 ) +// +// Else ...i_1 = 1 +// +// U_hi := 1 + QQ_1 * r_hi * r_hi +// U_lo := QQ_1 * (r - r_hi) * (r + r_hi) +// poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16 +// correction := -c * r * (1 + S_1 * r^2) +// +// End +// +// Finally, +// +// V := poly + ( U_lo + correction ) +// +// / U_hi + V if i_0 = 0 +// result := | +// \ (-U_hi) - V if i_0 = 1 +// +// It is important that in the last step, negation of U_hi is +// performed prior to the subtraction which is to be performed in +// the user-set rounding mode. +// +// +// Algorithmic Description +// ======================= +// +// The argument reduction algorithm is tightly integrated into FSIN +// and FCOS which share the same code. The following is complete and +// self-contained. The argument reduction description given +// previously is repeated below. +// +// +// Step 0. Initialization. +// +// If FSIN is invoked, set N_inc := 0; else if FCOS is invoked, +// set N_inc := 1. +// +// Step 1. Check for exceptional and special cases. +// +// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special +// handling. +// * If |Arg| < 2^24, go to Step 2 for reduction of moderate +// arguments. This is the most likely case. +// * If |Arg| < 2^63, go to Step 8 for pre-reduction of large +// arguments. +// * If |Arg| >= 2^63, go to Step 10 for special handling. +// +// Step 2. Reduction of moderate arguments. +// +// If |Arg| < pi/4 ...quick branch +// N_fix := N_inc (integer) +// r := Arg +// c := 0.0 +// Branch to Step 4, Case_1_complete +// Else ...cf. 
argument reduction +// N := Arg * two_by_PI (fp) +// N_fix := fcvt.fx( N ) (int) +// N := fcvt.xf( N_fix ) +// N_fix := N_fix + N_inc +// s := Arg - N * P_1 (first piece of pi/2) +// w := -N * P_2 (second piece of pi/2) +// +// If |s| >= 2^(-33) +// go to Step 3, Case_1_reduce +// Else +// go to Step 7, Case_2_reduce +// Endif +// Endif +// +// Step 3. Case_1_reduce. +// +// r := s + w +// c := (s - r) + w ...observe order +// +// Step 4. Case_1_complete +// +// ...At this point, the reduced argument alpha is +// ...accurately represented as r + c. +// If |r| < 2^(-3), go to Step 6, small_r. +// +// Step 5. Normal_r. +// +// Let [i_0 i_1] be the 2 lsb of N_fix. +// FR_rsq := r * r +// r_hi := frcpa( frcpa( r ) ) +// r_lo := r - r_hi +// +// If i_1 = 0, then +// poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8)) +// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order +// U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi) +// correction := c + c*C_1*FR_rsq ...any order +// Else +// poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8)) +// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order +// U_lo := QQ_1 * r_lo * (r + r_hi) +// correction := -c*(r + S_1*FR_rsq*r) ...any order +// Endif +// +// V := poly + (U_lo + correction) ...observe order +// +// result := (i_0 == 0? 1.0 : -1.0) +// +// Last instruction in user-set rounding mode +// +// result := (i_0 == 0? result*U_hi + V : +// result*U_hi - V) +// +// Return +// +// Step 6. Small_r. +// +// ...Use flush to zero mode without causing exception +// Let [i_0 i_1] be the two lsb of N_fix.
+// +// FR_rsq := r * r +// +// If i_1 = 0 then +// z := FR_rsq*FR_rsq; z := FR_rsq*z *r +// poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5) +// poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2) +// correction := c +// result := r +// Else +// z := FR_rsq*FR_rsq; z := FR_rsq*z +// poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5) +// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2) +// correction := -c*r +// result := 1 +// Endif +// +// poly := poly_hi + (z * poly_lo + correction) +// +// If i_0 = 1, result := -result +// +// Last operation. Perform in user-set rounding mode +// +// result := (i_0 == 0? result + poly : +// result - poly ) +// Return +// +// Step 7. Case_2_reduce. +// +// ...Refer to the write up for argument reduction for +// ...rationale. The reduction algorithm below is taken from +// ...argument reduction description and integrated this. +// +// w := N*P_3 +// U_1 := N*P_2 + w ...FMA +// U_2 := (N*P_2 - U_1) + w ...2 FMA +// ...U_1 + U_2 is N*(P_2+P_3) accurately +// +// r := s - U_1 +// c := ( (s - r) - U_1 ) - U_2 +// +// ...The mathematical sum r + c approximates the reduced +// ...argument accurately. Note that although compared to +// ...Case 1, this case requires much more work to reduce +// ...the argument, the subsequent calculation needed for +// ...any of the trigonometric function is very little because +// ...|alpha| < 1.01*2^(-33) and thus two terms of the +// ...Taylor series expansion suffices. +// +// If i_1 = 0 then +// poly := c + S_1 * r * r * r ...any order +// result := r +// Else +// poly := -2^(-67) +// result := 1.0 +// Endif +// +// If i_0 = 1, result := -result +// +// Last operation. Perform in user-set rounding mode +// +// result := (i_0 == 0? result + poly : +// result - poly ) +// +// Return +// +// +// Step 8. Pre-reduction of large arguments. +// +// ...Again, the following reduction procedure was described +// ...in the separate write up for argument reduction, which +// ...is tightly integrated here. 
+ +// N_0 := Arg * Inv_P_0 +// N_0_fix := fcvt.fx( N_0 ) +// N_0 := fcvt.xf( N_0_fix) + +// Arg' := Arg - N_0 * P_0 +// w := N_0 * d_1 +// N := Arg' * two_by_PI +// N_fix := fcvt.fx( N ) +// N := fcvt.xf( N_fix ) +// N_fix := N_fix + N_inc +// +// s := Arg' - N * P_1 +// w := w - N * P_2 +// +// If |s| >= 2^(-14) +// go to Step 3 +// Else +// go to Step 9 +// Endif +// +// Step 9. Case_4_reduce. +// +// ...first obtain N_0*d_1 and -N*P_2 accurately +// U_hi := N_0 * d_1 V_hi := -N*P_2 +// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - V_hi ...FMAs +// +// ...compute the contribution from N_0*d_2 and -N*P_3 +// w := -N*P_3 +// w := w + N_0*d_2 +// t := U_lo + V_lo + w ...any order +// +// ...at this point, the mathematical value +// ...s + U_hi + V_hi + t approximates the true reduced argument +// ...accurately. Just need to compute this accurately. +// +// ...Calculate U_hi + V_hi accurately: +// A := U_hi + V_hi +// if |U_hi| >= |V_hi| then +// a := (U_hi - A) + V_hi +// else +// a := (V_hi - A) + U_hi +// endif +// ...order in computing "a" must be observed. This branch is +// ...best implemented by predicates. +// ...A + a is U_hi + V_hi accurately. Moreover, "a" is +// ...much smaller than A: |a| <= (1/2)ulp(A). +// +// ...Just need to calculate s + A + a + t +// C_hi := s + A t := t + a +// C_lo := (s - C_hi) + A +// C_lo := C_lo + t +// +// ...Final steps for reduction +// r := C_hi + C_lo +// c := (C_hi - r) + C_lo +// +// ...At this point, we have r and c +// ...And all we need is a couple of terms of the corresponding +// ...Taylor series. +// +// If i_1 = 0 +// poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2) +// result := r +// Else +// poly := FR_rsq*(C_1 + FR_rsq*C_2) +// result := 1 +// Endif +// +// If i_0 = 1, result := -result +// +// Last operation. Perform in user-set rounding mode +// +// result := (i_0 == 0?
result + poly : +// result - poly ) +// Return +// +// Large Arguments: For arguments above 2**63, a Payne-Hanek +// style argument reduction is used and pi_by_2 reduce is called. +// + + +#ifdef _LIBC +.rodata +#else +.data +#endif +.align 64 + +FSINCOS_CONSTANTS: +ASM_TYPE_DIRECTIVE(FSINCOS_CONSTANTS,@object) +data4 0x4B800000, 0xCB800000, 0x00000000,0x00000000 // two**24, -two**24 +data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2 +data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0 +data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1 +data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2 +data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3 +data4 0x5F000000, 0xDF000000, 0x00000000,0x00000000 // two_to_63, -two_to_63 +data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0 +data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1 +data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2 +data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4 +data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE,0x00000000 // neg_pi_by_4 +data4 0x3E000000, 0xBE000000, 0x00000000,0x00000000 // two**-3, -two**-3 +data4 0x2F000000, 0xAF000000, 0x9E000000,0x00000000 // two**-33, -two**-33, -two**-67 +data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8 +data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7 +data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6 +data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5 +data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1 +data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi +data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4 +data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3 +data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2 +data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lo +data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2,0x00000000 // QQ_8 +data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA,0x00000000 // QQ_7 +data4 
0x9C716658, 0x8F76C650, 0x00003FE2,0x00000000 // QQ_6 +data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9,0x00000000 // QQ_5 +data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1 +data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1 +data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4 +data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3 +data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2 +data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1 +data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2 +data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3 +data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4 +data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5 +data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1 +data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2 +data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3 +data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4 +data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5 +data4 0x38800000, 0xB8800000, 0x00000000 // two**-14, -two**-14 +ASM_SIZE_DIRECTIVE(FSINCOS_CONSTANTS) + +FR_Input_X = f8 +FR_Neg_Two_to_M3 = f32 +FR_Two_to_63 = f32 +FR_Two_to_24 = f33 +FR_Pi_by_4 = f33 +FR_Two_to_M14 = f34 +FR_Two_to_M33 = f35 +FR_Neg_Two_to_24 = f36 +FR_Neg_Pi_by_4 = f36 +FR_Neg_Two_to_M14 = f37 +FR_Neg_Two_to_M33 = f38 +FR_Neg_Two_to_M67 = f39 +FR_Inv_pi_by_2 = f40 +FR_N_float = f41 +FR_N_fix = f42 +FR_P_1 = f43 +FR_P_2 = f44 +FR_P_3 = f45 +FR_s = f46 +FR_w = f47 +FR_c = f48 +FR_r = f49 +FR_Z = f50 +FR_A = f51 +FR_a = f52 +FR_t = f53 +FR_U_1 = f54 +FR_U_2 = f55 +FR_C_1 = f56 +FR_C_2 = f57 +FR_C_3 = f58 +FR_C_4 = f59 +FR_C_5 = f60 +FR_S_1 = f61 +FR_S_2 = f62 +FR_S_3 = f63 +FR_S_4 = f64 +FR_S_5 = f65 +FR_poly_hi = f66 +FR_poly_lo = f67 +FR_r_hi = f68 +FR_r_lo = f69 +FR_rsq = f70 +FR_r_cubed = f71 +FR_C_hi = f72 +FR_N_0 = f73 +FR_d_1 = f74 +FR_V = f75 +FR_V_hi = f75 +FR_V_lo = f76 +FR_U_hi = f77 +FR_U_lo = f78 +FR_U_hiabs = f79 +FR_V_hiabs = f80 
+FR_PP_8 = f81 +FR_QQ_8 = f81 +FR_PP_7 = f82 +FR_QQ_7 = f82 +FR_PP_6 = f83 +FR_QQ_6 = f83 +FR_PP_5 = f84 +FR_QQ_5 = f84 +FR_PP_4 = f85 +FR_QQ_4 = f85 +FR_PP_3 = f86 +FR_QQ_3 = f86 +FR_PP_2 = f87 +FR_QQ_2 = f87 +FR_QQ_1 = f88 +FR_N_0_fix = f89 +FR_Inv_P_0 = f90 +FR_corr = f91 +FR_poly = f92 +FR_d_2 = f93 +FR_Two_to_M3 = f94 +FR_Neg_Two_to_63 = f94 +FR_P_0 = f95 +FR_C_lo = f96 +FR_PP_1 = f97 +FR_PP_1_lo = f98 +FR_ArgPrime = f99 + +GR_Table_Base = r32 +GR_Table_Base1 = r33 +GR_i_0 = r34 +GR_i_1 = r35 +GR_N_Inc = r36 +GR_Sin_or_Cos = r37 + +GR_SAVE_B0 = r39 +GR_SAVE_GP = r40 +GR_SAVE_PFS = r41 + +.section .text +.proc __libm_sin_double_dbx# +.align 64 +__libm_sin_double_dbx: + +{ .mlx +alloc GR_Table_Base = ar.pfs,0,12,2,0 + movl GR_Sin_or_Cos = 0x0 ;; +} + +{ .mmi + nop.m 999 + addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.i 999 +} +;; + + +{ .mib + nop.m 999 + nop.i 999 + br.cond.sptk L(SINCOS_CONTINUE) ;; +} + +.endp __libm_sin_double_dbx# +ASM_SIZE_DIRECTIVE(__libm_sin_double_dbx) + +.section .text +.proc __libm_cos_double_dbx# +__libm_cos_double_dbx: + +{ .mlx +alloc GR_Table_Base= ar.pfs,0,12,2,0 + movl GR_Sin_or_Cos = 0x1 ;; +} + +{ .mmi + nop.m 999 + addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.i 999 +} +;; + +// +// Load Table Address +// +L(SINCOS_CONTINUE): + +{ .mmi + add GR_Table_Base1 = 96, GR_Table_Base + ldfs FR_Two_to_24 = [GR_Table_Base], 4 + nop.i 999 +} +;; + +{ .mmi + nop.m 999 +// +// Load 2**24, load 2**63. +// + ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12 + mov r41 = ar.pfs ;; +} + +{ .mfi + ldfs FR_Two_to_63 = [GR_Table_Base1], 4 +// +// Check for unnormals - unsupported operands. 
We do not want +// to generate denormal exception +// Check for NatVals, QNaNs, SNaNs, +/-Infs +// Check for EM unsupporteds +// Check for Zero +// + fclass.m.unc p6, p8 = FR_Input_X, 0x1E3 + mov r40 = gp ;; +} + +{ .mfi + nop.m 999 + fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF +// GR_Sin_or_Cos denotes + mov r39 = b0 +} + +{ .mfb + ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12 + fclass.m.unc p10, p0 = FR_Input_X, 0x007 +(p6) br.cond.spnt L(SINCOS_SPECIAL) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p8) br.cond.spnt L(SINCOS_SPECIAL) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +// +// Branch if +/- NaN, Inf. +// Load -2**24, load -2**63. +// +(p10) br.cond.spnt L(SINCOS_ZERO) ;; +} + +{ .mmb + ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16 + ldfe FR_Inv_P_0 = [GR_Table_Base1], 16 + nop.b 999 ;; +} + +{ .mmb + nop.m 999 + ldfe FR_d_1 = [GR_Table_Base1], 16 + nop.b 999 ;; +} +// +// Raise possible denormal operand flag with useful fcmp +// Is x <= -2**63 +// Load Inv_P_0 for pre-reduction +// Load Inv_pi_by_2 +// + +{ .mmb + ldfe FR_P_0 = [GR_Table_Base], 16 + ldfe FR_d_2 = [GR_Table_Base1], 16 + nop.b 999 ;; +} +// +// Load P_0 +// Load d_1 +// Is x >= 2**63 +// Is x <= -2**24? +// + +{ .mmi + ldfe FR_P_1 = [GR_Table_Base], 16 ;; +// +// Load P_1 +// Load d_2 +// Is x >= 2**24? +// + ldfe FR_P_2 = [GR_Table_Base], 16 + nop.i 999 ;; +} + +{ .mmf + nop.m 999 + ldfe FR_P_3 = [GR_Table_Base], 16 + fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24 +} + +{ .mfi + nop.m 999 +// +// Branch if +/- zero. 
+// Decide about the paths to take: +// If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2 +// OTHERWISE - CASE 3 OR 4 +// + fcmp.le.unc.s0 p10, p11 = FR_Input_X, FR_Neg_Two_to_63 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24 + nop.i 999 +} + +{ .mfi + ldfe FR_Pi_by_4 = [GR_Table_Base1], 16 +(p11) fcmp.ge.s1 p10, p0 = FR_Input_X, FR_Two_to_63 + nop.i 999 ;; +} + +{ .mmi + ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;; + ldfs FR_Two_to_M3 = [GR_Table_Base1], 4 + nop.i 999 ;; +} + +{ .mib + ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12 + nop.i 999 +// +// Load P_2 +// Load P_3 +// Load pi_by_4 +// Load neg_pi_by_4 +// Load 2**(-3) +// Load -2**(-3). +// +(p10) br.cond.spnt L(SINCOS_ARG_TOO_LARGE) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +// +// Branch out if x >= 2**63. Use Payne-Hanek Reduction +// +(p7) br.cond.spnt L(SINCOS_LARGER_ARG) ;; +} + +{ .mfi + nop.m 999 +// +// Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction. +// + fma.s1 FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fcmp.lt.unc.s1 p6, p7 = FR_Input_X, FR_Pi_by_4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Select the case when |Arg| < pi/4 +// Else Select the case when |Arg| >= pi/4 +// + fcvt.fx.s1 FR_N_fix = FR_N_float + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N = Arg * 2/pi +// Check if Arg < pi/4 +// +(p6) fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4 + nop.i 999 ;; +} +// +// Case 2: Convert integer N_fix back to normalized floating-point value. +// Case 1: p8 is only affected when p6 is set +// + +{ .mfi +(p7) ldfs FR_Two_to_M33 = [GR_Table_Base1], 4 +// +// Grab the integer part of N and call it N_fix +// +(p6) fmerge.se FR_r = FR_Input_X, FR_Input_X +// If |x| < pi/4, r = x and c = 0 +// lf |x| < pi/4, is x < 2**(-3). 
+// r = Arg +// c = 0 +(p6) mov GR_N_Inc = GR_Sin_or_Cos ;; +} + +{ .mmf + nop.m 999 +(p7) ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4 +(p6) fmerge.se FR_c = f0, f0 +} + +{ .mfi + nop.m 999 +(p6) fcmp.lt.unc.s1 p8, p9 = FR_Input_X, FR_Two_to_M3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// lf |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8. +// If |x| >= pi/4, +// Create the right N for |x| < pi/4 and otherwise +// Case 2: Place integer part of N in GP register +// +(p7) fcvt.xf FR_N_float = FR_N_fix + nop.i 999 ;; +} + +{ .mmf + nop.m 999 +(p7) getf.sig GR_N_Inc = FR_N_fix +(p8) fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +// +// Load 2**(-33), -2**(-33) +// +(p8) br.cond.spnt L(SINCOS_SMALL_R) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.sptk L(SINCOS_NORMAL_R) ;; +} +// +// if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise. +// +// +// In this branch, |x| >= pi/4. +// + +{ .mfi + ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8 +// +// Load -2**(-67) +// + fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X +// +// w = N * P_2 +// s = -N * P_1 + Arg +// + add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos +} + +{ .mfi + nop.m 999 + fma.s1 FR_w = FR_N_float, FR_P_2, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Adjust N_fix by N_inc to determine whether sine or +// cosine is being calculated +// + fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// Remember x >= pi/4. 
+// Is s <= -2**(-33) or s >= 2**(-33) (p6) +// or -2**(-33) < s < 2**(-33) (p7) +(p6) fms.s1 FR_r = FR_s, f1, FR_w + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w + nop.i 999 +} + +{ .mfi + nop.m 999 +(p6) fms.s1 FR_c = FR_s, f1, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// For big s: r = s - w: No further reduction is necessary +// For small s: w = N * P_3 (change sign) More reduction +// +(p6) fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fms.s1 FR_r = FR_s, f1, FR_U_1 + nop.i 999 +} + +{ .mfb + nop.m 999 +// +// For big s: Is |r| < 2**(-3)? +// For big s: c = S - r +// For small s: U_1 = N * P_2 + w +// +// If p8 is set, prepare to branch to Small_R. +// If p9 is set, prepare to branch to Normal_R. +// For big s, r is complete here. +// +(p6) fms.s1 FR_c = FR_c, f1, FR_w +// +// For big s: c = c - w as coded (w has not been negated.) +// For small s: r = S - U_1 +// +(p8) br.cond.spnt L(SINCOS_SMALL_R) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.sptk L(SINCOS_NORMAL_R) ;; +} + +{ .mfi +(p7) add GR_Table_Base1 = 224, GR_Table_Base1 +// +// Branch to SINCOS_SMALL_R or SINCOS_NORMAL_R +// +(p7) fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1 +// +// c = S - U_1 +// r = S_1 * r +// +// +(p7) extr.u GR_i_1 = GR_N_Inc, 0, 1 +} + +{ .mmi + nop.m 999 ;; +// +// Get [i_0,i_1] - two lsb of N_fix_gr. +// Do dummy fmpy so inexact is always set. +// +(p7) cmp.eq.unc p9, p10 = 0x0, GR_i_1 +(p7) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; +} +// +// For small s: U_2 = N * P_2 - U_1 +// S_1 stored constant - grab the one stored with the +// coefficients. 
+// + +{ .mfi +(p7) ldfe FR_S_1 = [GR_Table_Base1], 16 +// +// Check if i_1 and i_0 != 0 +// +(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67 +(p7) cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;; +} + +{ .mfi + nop.m 999 +(p7) fms.s1 FR_s = FR_s, f1, FR_r + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// S = S - r +// U_2 = U_2 + w +// load S_1 +// +(p7) fma.s1 FR_rsq = FR_r, FR_r, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.s1 FR_U_2 = FR_U_2, f1, FR_w + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fmerge.se FR_Input_X = FR_r, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_Input_X = f0, f1, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// FR_rsq = r * r +// Save r as the result. +// +(p7) fms.s1 FR_c = FR_s, f1, FR_U_1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if ( i_1 ==0) poly = c + S_1*r*r*r +// else Result = 1 +// +(p12) fnma.s1 FR_Input_X = FR_Input_X, f1, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fma.s1 FR_r = FR_S_1, FR_r, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fma.d.s0 FR_S_1 = FR_S_1, FR_S_1, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// If i_1 != 0, poly = -2**(-67) (the operand used above is FR_Neg_Two_to_M67) +// +(p7) fms.s1 FR_c = FR_c, f1, FR_U_2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// c = c - U_2 +// +(p9) fma.s1 FR_poly = FR_r, FR_rsq, FR_c + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// i_0 != 0, so Result = -Result +// +(p11) fma.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p12) fms.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly +// +// if (i_0 == 0), Result = Result + poly +// else Result = Result - poly +// + br.ret.sptk b0 ;; +} +L(SINCOS_LARGER_ARG): + +{ .mfi + nop.m 999 + fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0 + nop.i 999 +} +;; + +// This path is for |argument| >= 2**24 +// Adjust table_ptr1 to beginning of table. 
+// + +{ .mmi + nop.m 999 + addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.i 999 +} +;; + + +// +// Point to 2**(-14) +// N_0 = Arg * Inv_P_0 +// + +{ .mmi + add GR_Table_Base = 688, GR_Table_Base ;; + ldfs FR_Two_to_M14 = [GR_Table_Base], 4 + nop.i 999 ;; +} + +{ .mfi + ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Load values 2**(-14) and -2**(-14) +// + fcvt.fx.s1 FR_N_0_fix = FR_N_0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N_0_fix = integer part of N_0 +// + fcvt.xf FR_N_0 = FR_N_0_fix + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Make N_0 the integer part +// + fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X + nop.i 999 +} + +{ .mfi + nop.m 999 + fma.s1 FR_w = FR_N_0, FR_d_1, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Arg' = -N_0 * P_0 + Arg +// w = N_0 * d_1 +// + fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N = A' * 2/pi +// + fcvt.fx.s1 FR_N_fix = FR_N_float + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N_fix is the integer part +// + fcvt.xf FR_N_float = FR_N_fix + nop.i 999 ;; +} + +{ .mfi + getf.sig GR_N_Inc = FR_N_fix + nop.f 999 + nop.i 999 ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; + add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;; +} + +{ .mfi + nop.m 999 +// +// N is the integer part of the reduced-reduced argument. 
+// Put the integer in a GP register +// + fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime + nop.i 999 +} + +{ .mfi + nop.m 999 + fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// s = -N*P_1 + Arg' +// w = -N*P_2 + w +// N_fix_gr = N_fix_gr + N_inc +// + fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// For |s| > 2**(-14) r = S + w (r complete) +// Else U_hi = N_0 * d_1 +// +(p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Either S <= -2**(-14) or S >= 2**(-14) +// or -2**(-14) < s < 2**(-14) +// +(p8) fma.s1 FR_r = FR_s, f1, FR_w + nop.i 999 +} + +{ .mfi + nop.m 999 +(p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// We need abs of both U_hi and V_hi - don't +// worry about switched sign of V_hi. +// +(p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Big s: finish up c = (S - r) + w (c complete) +// Case 4: A = U_hi + V_hi +// Note: Worry about switched sign of V_hi, so subtract instead of add. +// +(p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fmerge.s FR_V_hiabs = f0, FR_V_hi + nop.i 999 +} + +{ .mfi + nop.m 999 +// For big s: c = S - r +// For small s do more work: U_lo = N_0 * d_1 - U_hi +// +(p9) fmerge.s FR_U_hiabs = f0, FR_U_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// For big s: Is |r| < 2**(-3) +// For big s: if p12 set, prepare to branch to Small_R. +// For big s: If p13 set, prepare to branch to Normal_R. 
+// +(p8) fms.s1 FR_c = FR_s, f1, FR_r + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// For small S: V_hi = N * P_2 +// w = N * P_3 +// Note the product does not include the (-) as in the writeup +// so (-) missing for V_hi and w. +// +(p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p8) fma.s1 FR_c = FR_c, f1, FR_w + nop.i 999 +} + +{ .mfb + nop.m 999 +(p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w +(p12) br.cond.spnt L(SINCOS_SMALL_R) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p13) br.cond.sptk L(SINCOS_NORMAL_R) ;; +} + +{ .mfi + nop.m 999 +// +// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true. +// The remaining stuff is for Case 4. +// Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup) +// Note: the (-) is still missing for V_lo. +// Small s: w = w + N_0 * d_2 +// Note: the (-) is now incorporated in w. +// +(p9) fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs + extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; +} + +{ .mfi + nop.m 999 +// +// C_hi = S + A +// +(p9) fma.s1 FR_t = FR_U_lo, f1, FR_V_lo + extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; +} + +{ .mfi + nop.m 999 +// +// t = U_lo + V_lo +// +// +(p10) fms.s1 FR_a = FR_U_hi, f1, FR_A + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fma.s1 FR_a = FR_V_hi, f1, FR_A + nop.i 999 +} +;; + +{ .mmi + nop.m 999 + addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.i 999 +} +;; + + +{ .mfi + add GR_Table_Base = 528, GR_Table_Base +// +// Is U_hiabs >= V_hiabs? +// +(p9) fma.s1 FR_C_hi = FR_s, f1, FR_A + nop.i 999 ;; +} + +{ .mmi + ldfe FR_C_1 = [GR_Table_Base], 16 ;; + ldfe FR_C_2 = [GR_Table_Base], 64 + nop.i 999 ;; +} + +{ .mmf + nop.m 999 +// +// c = c + C_lo finished. 
+// Load C_2 +// + ldfe FR_S_1 = [GR_Table_Base], 16 +// +// C_lo = S - C_hi +// + fma.s1 FR_t = FR_t, f1, FR_w ;; +} +// +// r and c have been computed. +// Make sure ftz mode is set - should be automatic when using wre +// |r| < 2**(-3) +// Get [i_0,i_1] - two lsb of N_fix. +// Load S_1 +// + +{ .mfi + ldfe FR_S_2 = [GR_Table_Base], 64 +// +// t = t + w +// +(p10) fms.s1 FR_a = FR_a, f1, FR_V_hi + cmp.eq.unc p9, p10 = 0x0, GR_i_0 +} + +{ .mfi + nop.m 999 +// +// For larger u than v: a = U_hi - A +// Else a = V_hi - A (do an add to account for missing (-) on V_hi +// + fms.s1 FR_C_lo = FR_s, f1, FR_C_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fms.s1 FR_a = FR_U_hi, f1, FR_a + cmp.eq.unc p11, p12 = 0x0, GR_i_1 +} + +{ .mfi + nop.m 999 +// +// If u > v: a = (U_hi - A) + V_hi +// Else a = (V_hi - A) + U_hi +// In each case account for negative missing from V_hi. +// + fma.s1 FR_C_lo = FR_C_lo, f1, FR_A + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// C_lo = (S - C_hi) + A +// + fma.s1 FR_t = FR_t, f1, FR_a + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// t = t + a +// + fma.s1 FR_C_lo = FR_C_lo, f1, FR_t + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// C_lo = C_lo + t +// Adjust Table_Base to beginning of table +// + fma.s1 FR_r = FR_C_hi, f1, FR_C_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Load S_2 +// + fma.s1 FR_rsq = FR_r, FR_r, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Table_Base points to C_1 +// r = C_hi + C_lo +// + fms.s1 FR_c = FR_C_hi, f1, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if i_1 ==0: poly = S_2 * FR_rsq + S_1 +// else poly = C_2 * FR_rsq + C_1 +// +(p11) fma.s1 FR_Input_X = f0, f1, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 FR_Input_X = f0, f1, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Compute r_cube = FR_rsq * r +// +(p11) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// 
Compute FR_rsq = r * r +// Is i_1 == 0 ? +// + fma.s1 FR_r_cubed = FR_rsq, FR_r, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// c = C_hi - r +// Load C_1 +// + fma.s1 FR_c = FR_c, f1, FR_C_lo + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// if i_1 ==0: poly = r_cube * poly + c +// else poly = FR_rsq * poly +// +(p10) fms.s1 FR_Input_X = f0, f1, FR_Input_X + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if i_1 ==0: Result = r +// else Result = 1.0 +// +(p11) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 FR_poly = FR_rsq, FR_poly, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if i_0 !=0: Result = -Result +// +(p9) fma.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p10) fms.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly +// +// if i_0 == 0: Result = Result + poly +// else Result = Result - poly +// + br.ret.sptk b0 ;; +} +L(SINCOS_SMALL_R): + +{ .mii + nop.m 999 + extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; +// +// +// Compare both i_1 and i_0 with 0. +// if i_1 == 0, set p9. +// if i_0 == 0, set p11. +// + cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; +} + +{ .mfi + nop.m 999 + fma.s1 FR_rsq = FR_r, FR_r, f0 + extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; +} + +{ .mfi + nop.m 999 +// +// Z = Z * FR_rsq +// +(p10) fnma.s1 FR_c = FR_c, FR_r, f0 + cmp.eq.unc p11, p12 = 0x0, GR_i_0 +} +;; + +// ****************************************************************** +// ****************************************************************** +// ****************************************************************** +// r and c have been computed. +// We know whether this is the sine or cosine routine. +// Make sure ftz mode is set - should be automatic when using wre +// |r| < 2**(-3) +// +// Set table_ptr1 to beginning of constant table. +// Get [i_0,i_1] - two lsb of N_fix_gr. 
+// + +{ .mmi + nop.m 999 + addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.i 999 +} +;; + + +// +// Set table_ptr1 to point to S_5. +// Set table_ptr1 to point to C_5. +// Compute FR_rsq = r * r +// + +{ .mfi +(p9) add GR_Table_Base = 672, GR_Table_Base +(p10) fmerge.s FR_r = f1, f1 +(p10) add GR_Table_Base = 592, GR_Table_Base ;; +} +// +// Set table_ptr1 to point to S_5. +// Set table_ptr1 to point to C_5. +// + +{ .mmi +(p9) ldfe FR_S_5 = [GR_Table_Base], -16 ;; +// +// if (i_1 == 0) load S_5 +// if (i_1 != 0) load C_5 +// +(p9) ldfe FR_S_4 = [GR_Table_Base], -16 + nop.i 999 ;; +} + +{ .mmf +(p10) ldfe FR_C_5 = [GR_Table_Base], -16 +// +// Z = FR_rsq * FR_rsq +// +(p9) ldfe FR_S_3 = [GR_Table_Base], -16 +// +// Compute FR_rsq = r * r +// if (i_1 == 0) load S_4 +// if (i_1 != 0) load C_4 +// + fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;; +} +// +// if (i_1 == 0) load S_3 +// if (i_1 != 0) load C_3 +// + +{ .mmi +(p9) ldfe FR_S_2 = [GR_Table_Base], -16 ;; +// +// if (i_1 == 0) load S_2 +// if (i_1 != 0) load C_2 +// +(p9) ldfe FR_S_1 = [GR_Table_Base], -16 + nop.i 999 +} + +{ .mmi +(p10) ldfe FR_C_4 = [GR_Table_Base], -16 ;; +(p10) ldfe FR_C_3 = [GR_Table_Base], -16 + nop.i 999 ;; +} + +{ .mmi +(p10) ldfe FR_C_2 = [GR_Table_Base], -16 ;; +(p10) ldfe FR_C_1 = [GR_Table_Base], -16 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// if (i_1 != 0): +// poly_lo = FR_rsq * C_5 + C_4 +// poly_hi = FR_rsq * C_2 + C_1 +// +(p9) fma.s1 FR_Z = FR_Z, FR_r, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1 == 0) load S_1 +// if (i_1 != 0) load C_1 +// +(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// c = -c * r +// dummy fmpy's to flag inexact. 
+// +(p9) fma.d.s0 FR_S_4 = FR_S_4, FR_S_4, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// poly_lo = FR_rsq * poly_lo + C_3 +// poly_hi = FR_rsq * poly_hi +// + fma.s1 FR_Z = FR_Z, FR_rsq, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// if (i_1 == 0): +// poly_lo = FR_rsq * S_5 + S_4 +// poly_hi = FR_rsq * S_2 + S_1 +// +(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1 == 0): +// Z = Z * r for only one of the small r cases - not there +// in original implementation notes. +// +(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p10) fma.d.s0 FR_C_1 = FR_C_1, FR_C_1, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// poly_lo = FR_rsq * poly_lo + S_3 +// poly_hi = FR_rsq * poly_hi +// +(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1 == 0): dummy fmpy's to flag inexact +// r = 1 +// +(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// poly_hi = r * poly_hi +// + fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fms.s1 FR_r = f0, f1, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// poly_hi = Z * poly_lo + c +// if i_0 == 1: r = -r +// + fma.s1 FR_poly = FR_poly, f1, FR_poly_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fms.d.s0 FR_Input_X = FR_r, f1, FR_poly + nop.i 999 +} + +{ .mfb + nop.m 999 +// +// poly = poly + poly_hi +// +(p11) fma.d.s0 FR_Input_X = FR_r, f1, FR_poly +// +// if (i_0 == 0) Result = r + poly +// if (i_0 != 0) Result = r - poly +// + br.ret.sptk b0 ;; +} 
+L(SINCOS_NORMAL_R): + +{ .mii + nop.m 999 + extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; +// +// Set table_ptr1 and table_ptr2 to base address of +// constant table. + cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; +} + +{ .mfi + nop.m 999 + fma.s1 FR_rsq = FR_r, FR_r, f0 + extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; +} + +{ .mfi + nop.m 999 + frcpa.s1 FR_r_hi, p6 = f1, FR_r + cmp.eq.unc p11, p12 = 0x0, GR_i_0 +} +;; + +// ****************************************************************** +// ****************************************************************** +// ****************************************************************** +// +// r and c have been computed. +// We know whether this is the sine or cosine routine. +// Make sure ftz mode is set - should be automatic when using wre +// Get [i_0,i_1] - two lsb of N_fix_gr alone. +// + +{ .mmi + nop.m 999 + addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.i 999 +} +;; + + +{ .mfi +(p10) add GR_Table_Base = 384, GR_Table_Base +(p12) fms.s1 FR_Input_X = f0, f1, f1 +(p9) add GR_Table_Base = 224, GR_Table_Base ;; +} + +{ .mmf + nop.m 999 +(p10) ldfe FR_QQ_8 = [GR_Table_Base], 16 +// +// if (i_1==0) poly = poly * FR_rsq + PP_1_lo +// else poly = FR_rsq * poly +// +(p11) fma.s1 FR_Input_X = f0, f1, f1 ;; +} + +{ .mmf +(p10) ldfe FR_QQ_7 = [GR_Table_Base], 16 +// +// Adjust table pointers based on i_1 (p9/p10 come from the compare on GR_i_1) +// Compute rsq = r * r +// +(p9) ldfe FR_PP_8 = [GR_Table_Base], 16 + fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 ;; +} + +{ .mmf +(p9) ldfe FR_PP_7 = [GR_Table_Base], 16 +(p10) ldfe FR_QQ_6 = [GR_Table_Base], 16 +// +// Load PP_8 and QQ_8; PP_7 and QQ_7 +// + frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;; +} +// +// if (i_1==0) poly = PP_7 + FR_rsq * PP_8. +// else poly = QQ_7 + FR_rsq * QQ_8. 
+// + +{ .mmb +(p9) ldfe FR_PP_6 = [GR_Table_Base], 16 +(p10) ldfe FR_QQ_5 = [GR_Table_Base], 16 + nop.b 999 ;; +} + +{ .mmb +(p9) ldfe FR_PP_5 = [GR_Table_Base], 16 +(p10) ldfe FR_S_1 = [GR_Table_Base], 16 + nop.b 999 ;; +} + +{ .mmb +(p10) ldfe FR_QQ_1 = [GR_Table_Base], 16 +(p9) ldfe FR_C_1 = [GR_Table_Base], 16 + nop.b 999 ;; +} + +{ .mmi +(p10) ldfe FR_QQ_4 = [GR_Table_Base], 16 ;; +(p9) ldfe FR_PP_1 = [GR_Table_Base], 16 + nop.i 999 ;; +} + +{ .mmf +(p10) ldfe FR_QQ_3 = [GR_Table_Base], 16 +// +// if (i_1=0) corr = corr + c*c +// else corr = corr * c +// +(p9) ldfe FR_PP_4 = [GR_Table_Base], 16 +(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 ;; +} +// +// if (i_1=0) poly = rsq * poly + PP_5 +// else poly = rsq * poly + QQ_5 +// Load PP_4 or QQ_4 +// + +{ .mmf +(p9) ldfe FR_PP_3 = [GR_Table_Base], 16 +(p10) ldfe FR_QQ_2 = [GR_Table_Base], 16 +// +// r_hi = frcpa(frcpa(r)). +// r_cube = r * FR_rsq. +// +(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 ;; +} +// +// Do dummy multiplies so inexact is always set. 
+// + +{ .mfi +(p9) ldfe FR_PP_2 = [GR_Table_Base], 16 +// +// r_lo = r - r_hi +// +(p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0 + nop.i 999 ;; +} + +{ .mmf + nop.m 999 +(p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16 +(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1=0) U_lo = r_hi * r_hi +// else U_lo = r_hi + r +// +(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1=0) corr = C_1 * rsq +// else corr = S_1 * r_cubed + r +// +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1=0) U_hi = r_hi + U_hi +// else U_hi = QQ_1 * U_hi + 1 +// +(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// U_hi = r_hi * r_hi +// + fms.s1 FR_r_lo = FR_r, f1, FR_r_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Load PP_1, PP_6, PP_5, and C_1 +// Load QQ_1, QQ_6, QQ_5, and S_1 +// + fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1=0) U_lo = r * r_hi + U_lo +// else U_lo = r_lo * U_lo +// +(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// if (i_1 =0) U_hi = r + U_hi +// if (i_1 =0) U_lo = r_lo * U_lo +// +// +(p9) fma.d.s0 FR_PP_5 = FR_PP_5, FR_PP_4, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo + nop.i 999 +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1=0) poly = poly * rsq + PP_6 +// else poly = poly * rsq 
+ QQ_6 +// +(p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fma.d.s0 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1!=0) U_hi = PP_1 * U_hi +// if (i_1!=0) U_lo = r * r + U_lo +// Load PP_3 or QQ_3 +// +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Load PP_2, QQ_2 +// +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1==0) poly = FR_rsq * poly + PP_3 +// else poly = FR_rsq * poly + QQ_3 +// Load PP_1_lo +// +(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1 =0) poly = poly * rsq + pp_r4 +// else poly = poly * rsq + qq_r4 +// +(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1==0) U_lo = PP_1_hi * U_lo +// else U_lo = QQ_1 * U_lo +// +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_0==0) Result = 1 +// else Result = -1 +// + fma.s1 FR_V = FR_U_lo, f1, FR_corr + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1==0) poly = FR_rsq * poly + PP_2 +// else poly = FR_rsq * poly + QQ_2 +// +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo + nop.i 999 ;; +} + +{ .mfi + nop.m 999 
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// V = U_lo + corr +// +(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// if (i_1==0) poly = r_cube * poly +// else poly = FR_rsq * poly +// + fma.s1 FR_V = FR_poly, f1, FR_V + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fms.d.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V + nop.i 999 +} + +{ .mfb + nop.m 999 +// +// V = V + poly +// +(p11) fma.d.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V +// +// if (i_0==0) Result = Result * U_hi + V +// else Result = Result * U_hi - V +// + br.ret.sptk b0 ;; +} + +// +// If cosine, FR_Input_X = 1 +// If sine, FR_Input_X = +/-Zero (Input FR_Input_X) +// Results are exact, no exceptions +// +L(SINCOS_ZERO): + +{ .mmb + cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos + nop.m 999 + nop.b 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X + nop.i 999 +} + +{ .mfb + nop.m 999 +(p6) fmerge.s FR_Input_X = f1, f1 + br.ret.sptk b0 ;; +} + +L(SINCOS_SPECIAL): + +// +// Path for Arg = +/- QNaN, SNaN, Inf +// Invalid can be raised. 
SNaNs +// become QNaNs +// + +{ .mfb + nop.m 999 + fmpy.d.s0 FR_Input_X = FR_Input_X, f0 + br.ret.sptk b0 ;; +} +.endp __libm_cos_double_dbx# +ASM_SIZE_DIRECTIVE(__libm_cos_double_dbx#) + + + +// +// __libm_callout_2 / L(SINCOS_ARG_TOO_LARGE): +// Out-of-line call to __libm_pi_by_2_reduce#, then dispatch to +// SINCOS_SMALL_R when -2**(-3) < r < 2**(-3) and to SINCOS_NORMAL_R otherwise. +// +// Call int pi_by_2_reduce(double* x, double *y) +// for |arguments| >= 2**63 +// Address to save r and c as double +// NOTE(review): x, r and c are moved with stfe/ldfe in 16-byte slots rather than +// as doubles - confirm against the callee's actual interface. +// +// Stack layout after 'add sp=-64,sp' (offsets relative to the new sp): +// psp sp+64 +// sp+48 -> f0 c +// r45 sp+32 -> f0 r +// r44 -> sp+16 -> InputX +// sp sp -> scratch provided to callee + + + +.proc __libm_callout_2 +__libm_callout_2: +L(SINCOS_ARG_TOO_LARGE): + +.prologue +{ .mfi + add r45=-32,sp // Parameter: r address + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [r45] = f0,16 // Clear Parameter r on stack + add r44 = 16,sp // Parameter x address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [r45] = f0,-16 // Clear Parameter c on stack + nop.i 0 + nop.b 0 +} +{ .mib + stfe [r44] = FR_Input_X // Store Parameter x on stack + nop.i 0 + br.call.sptk b0=__libm_pi_by_2_reduce# ;; +};; + + +{ .mii + ldfe FR_Input_X =[r44],16 +// +// Get X off the stack (load above) +// Readjust Table ptr: step back 16 bytes so 2**(-3) and -2**(-3) can be reloaded +// + adds GR_Table_Base1 = -16, GR_Table_Base1 +// +// N_Inc = Sin_or_Cos + r8 (r8 is presumably the int returned by the callee - confirm) +// + add GR_N_Inc = GR_Sin_or_Cos,r8 ;; +} +{ .mmb + ldfe FR_r =[r45],16 +// +// Get r and c off stack (r above, c in the next bundle) +// Reload 2**(-3) and -2**(-3) +// + ldfs FR_Two_to_M3 = [GR_Table_Base1],4 + nop.b 999 ;; +} +{ .mmb + ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0 + ldfe FR_c =[r45] + nop.b 999 ;; +} + +{ .mfi +.restore sp + add sp = 64,sp // Restore stack pointer + fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3 + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + nop.b 0 +};; + + +// +// p6 was set above when r < 2**(-3); keep it only if r > -2**(-3) as well +// +{ .mfi + nop.m 999 +(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3 + nop.i 999 ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(SINCOS_SMALL_R) ;; +} + 
+{ .mib + nop.m 999 + nop.i 999 + br.cond.sptk L(SINCOS_NORMAL_R) ;; +} + +.endp __libm_callout_2 +ASM_SIZE_DIRECTIVE(__libm_callout_2) + +.type __libm_pi_by_2_reduce#,@function +.global __libm_pi_by_2_reduce# + + +.type __libm_sin_double_dbx#,@function +.global __libm_sin_double_dbx# +.type __libm_cos_double_dbx#,@function +.global __libm_cos_double_dbx# diff --git a/sysdeps/ia64/fpu/s_cosf.S b/sysdeps/ia64/fpu/s_cosf.S new file mode 100644 index 0000000000..111d7da671 --- /dev/null +++ b/sysdeps/ia64/fpu/s_cosf.S @@ -0,0 +1,686 @@ + +.file "sincosf.s" + + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. + + +// History +//============================================================== +// 2/02/00 Initial revision +// 4/02/00 Unwind support added. 
+// 5/10/00 Improved speed with new algorithm. +// 8/08/00 Improved speed by avoiding SIR flush. +// 8/17/00 Changed predicate register macro-usage to direct predicate +// names due to an assembler bug. +// 8/30/00 Put sin_of_r before sin_tbl_S_cos_of_r to gain a cycle +// 1/02/00 Fixed flag settings, improved speed. +// +// API +//============================================================== +// float sinf( float x); +// float cosf( float x); +// + +#include "libm_support.h" + +// Assembly macros +//============================================================== + +// SIN_Sin_Flag = p6 +// SIN_Cos_Flag = p7 + +// integer registers used + + SIN_AD_PQ_1 = r33 + SIN_AD_PQ_2 = r33 + sin_GR_sincos_flag = r34 + sin_GR_Mint = r35 + + sin_GR_index = r36 + gr_tmp = r37 + + GR_SAVE_B0 = r37 + GR_SAVE_GP = r38 + GR_SAVE_PFS = r39 + + +// floating point registers used + + sin_coeff_P1 = f32 + sin_coeff_P2 = f33 + sin_coeff_Q1 = f34 + sin_coeff_Q2 = f35 + sin_coeff_P4 = f36 + sin_coeff_P5 = f37 + sin_coeff_Q3 = f38 + sin_coeff_Q4 = f39 + sin_Mx = f40 + sin_Mfloat = f41 + sin_tbl_S = f42 + sin_tbl_C = f43 + sin_r = f44 + sin_rcube = f45 + sin_tsq = f46 + sin_r7 = f47 + sin_t = f48 + sin_poly_p2 = f49 + sin_poly_p1 = f50 + fp_tmp = f51 + sin_poly_p3 = f52 + sin_poly_p4 = f53 + sin_of_r = f54 + sin_S_t = f55 + sin_poly_q2 = f56 + sin_poly_q1 = f57 + sin_S_tcube = f58 + sin_poly_q3 = f59 + sin_poly_q4 = f60 + sin_tbl_S_tcube = f61 + sin_tbl_S_cos_of_r = f62 + + sin_coeff_Q5 = f63 + sin_coeff_Q6 = f64 + sin_coeff_P3 = f65 + + sin_poly_q5 = f66 + sin_poly_q12 = f67 + sin_poly_q3456 = f68 + fp_tmp2 = f69 + SIN_NORM_f8 = f70 + + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +sin_coeff_1_table: +ASM_TYPE_DIRECTIVE(sin_coeff_1_table,@object) +data8 0xBF56C16C16BF6462 // q3 +data8 0x3EFA01A0128B9EBC // q4 +data8 0xBE927E42FDF33FFE // q5 +data8 0x3E21DA5C72A446F3 // q6 +data8 0x3EC71DD1D5E421A4 // p4 +data8 0xBE5AC5C9D0ACF95A // p5 +data8 0xBFC55555555554CA // p1 +data8 
0x3F811111110F2395 // p2 +data8 0xBFE0000000000000 // q1 +data8 0x3FA55555555554EF // q2 +data8 0xBF2A01A011232913 // p3 +data8 0x0000000000000000 // pad + + +///////////////////////////////////////// + +data8 0xBFE1A54991426566 //sin(-32) +data8 0x3FEAB1F5305DE8E5 //cos(-32) +data8 0x3FD9DBC0B640FC81 //sin(-31) +data8 0x3FED4591C3E12A20 //cos(-31) +data8 0x3FEF9DF47F1C903D //sin(-30) +data8 0x3FC3BE82F2505A52 //cos(-30) +data8 0x3FE53C7D20A6C9E7 //sin(-29) +data8 0xBFE7F01658314E47 //cos(-29) +data8 0xBFD156853B4514D6 //sin(-28) +data8 0xBFEECDAAD1582500 //cos(-28) +data8 0xBFEE9AA1B0E5BA30 //sin(-27) +data8 0xBFD2B266F959DED5 //cos(-27) +data8 0xBFE866E0FAC32583 //sin(-26) +data8 0x3FE4B3902691A9ED //cos(-26) +data8 0x3FC0F0E6F31E809D //sin(-25) +data8 0x3FEFB7EEF59504FF //cos(-25) +data8 0x3FECFA7F7919140F //sin(-24) +data8 0x3FDB25BFB50A609A //cos(-24) +data8 0x3FEB143CD0247D02 //sin(-23) +data8 0xBFE10CF7D591F272 //cos(-23) +data8 0x3F8220A29F6EB9F4 //sin(-22) +data8 0xBFEFFFADD8D4ACDA //cos(-22) +data8 0xBFEAC5E20BB0D7ED //sin(-21) +data8 0xBFE186FF83773759 //cos(-21) +data8 0xBFED36D8F55D3CE0 //sin(-20) +data8 0x3FDA1E043964A83F //cos(-20) +data8 0xBFC32F2D28F584CF //sin(-19) +data8 0x3FEFA377DE108258 //cos(-19) +data8 0x3FE8081668131E26 //sin(-18) +data8 0x3FE52150815D2470 //cos(-18) +data8 0x3FEEC3C4AC42882B //sin(-17) +data8 0xBFD19C46B07F58E7 //cos(-17) +data8 0x3FD26D02085F20F8 //sin(-16) +data8 0xBFEEA5257E962F74 //cos(-16) +data8 0xBFE4CF2871CEC2E8 //sin(-15) +data8 0xBFE84F5D069CA4F3 //cos(-15) +data8 0xBFEFB30E327C5E45 //sin(-14) +data8 0x3FC1809AEC2CA0ED //cos(-14) +data8 0xBFDAE4044881C506 //sin(-13) +data8 0x3FED09CDD5260CB7 //cos(-13) +data8 0x3FE12B9AF7D765A5 //sin(-12) +data8 0x3FEB00DA046B65E3 //cos(-12) +data8 0x3FEFFFEB762E93EB //sin(-11) +data8 0x3F7220AE41EE2FDF //cos(-11) +data8 0x3FE1689EF5F34F52 //sin(-10) +data8 0xBFEAD9AC890C6B1F //cos(-10) +data8 0xBFDA6026360C2F91 //sin( -9) +data8 0xBFED27FAA6A6196B //cos( -9) +data8 
0xBFEFA8D2A028CF7B //sin( -8) +data8 0xBFC29FBEBF632F94 //cos( -8) +data8 0xBFE50608C26D0A08 //sin( -7) +data8 0x3FE81FF79ED92017 //cos( -7) +data8 0x3FD1E1F18AB0A2C0 //sin( -6) +data8 0x3FEEB9B7097822F5 //cos( -6) +data8 0x3FEEAF81F5E09933 //sin( -5) +data8 0x3FD22785706B4AD9 //cos( -5) +data8 0x3FE837B9DDDC1EAE //sin( -4) +data8 0xBFE4EAA606DB24C1 //cos( -4) +data8 0xBFC210386DB6D55B //sin( -3) +data8 0xBFEFAE04BE85E5D2 //cos( -3) +data8 0xBFED18F6EAD1B446 //sin( -2) +data8 0xBFDAA22657537205 //cos( -2) +data8 0xBFEAED548F090CEE //sin( -1) +data8 0x3FE14A280FB5068C //cos( -1) +data8 0x0000000000000000 //sin( 0) +data8 0x3FF0000000000000 //cos( 0) +data8 0x3FEAED548F090CEE //sin( 1) +data8 0x3FE14A280FB5068C //cos( 1) +data8 0x3FED18F6EAD1B446 //sin( 2) +data8 0xBFDAA22657537205 //cos( 2) +data8 0x3FC210386DB6D55B //sin( 3) +data8 0xBFEFAE04BE85E5D2 //cos( 3) +data8 0xBFE837B9DDDC1EAE //sin( 4) +data8 0xBFE4EAA606DB24C1 //cos( 4) +data8 0xBFEEAF81F5E09933 //sin( 5) +data8 0x3FD22785706B4AD9 //cos( 5) +data8 0xBFD1E1F18AB0A2C0 //sin( 6) +data8 0x3FEEB9B7097822F5 //cos( 6) +data8 0x3FE50608C26D0A08 //sin( 7) +data8 0x3FE81FF79ED92017 //cos( 7) +data8 0x3FEFA8D2A028CF7B //sin( 8) +data8 0xBFC29FBEBF632F94 //cos( 8) +data8 0x3FDA6026360C2F91 //sin( 9) +data8 0xBFED27FAA6A6196B //cos( 9) +data8 0xBFE1689EF5F34F52 //sin( 10) +data8 0xBFEAD9AC890C6B1F //cos( 10) +data8 0xBFEFFFEB762E93EB //sin( 11) +data8 0x3F7220AE41EE2FDF //cos( 11) +data8 0xBFE12B9AF7D765A5 //sin( 12) +data8 0x3FEB00DA046B65E3 //cos( 12) +data8 0x3FDAE4044881C506 //sin( 13) +data8 0x3FED09CDD5260CB7 //cos( 13) +data8 0x3FEFB30E327C5E45 //sin( 14) +data8 0x3FC1809AEC2CA0ED //cos( 14) +data8 0x3FE4CF2871CEC2E8 //sin( 15) +data8 0xBFE84F5D069CA4F3 //cos( 15) +data8 0xBFD26D02085F20F8 //sin( 16) +data8 0xBFEEA5257E962F74 //cos( 16) +data8 0xBFEEC3C4AC42882B //sin( 17) +data8 0xBFD19C46B07F58E7 //cos( 17) +data8 0xBFE8081668131E26 //sin( 18) +data8 0x3FE52150815D2470 //cos( 18) +data8 0x3FC32F2D28F584CF 
//sin( 19) +data8 0x3FEFA377DE108258 //cos( 19) +data8 0x3FED36D8F55D3CE0 //sin( 20) +data8 0x3FDA1E043964A83F //cos( 20) +data8 0x3FEAC5E20BB0D7ED //sin( 21) +data8 0xBFE186FF83773759 //cos( 21) +data8 0xBF8220A29F6EB9F4 //sin( 22) +data8 0xBFEFFFADD8D4ACDA //cos( 22) +data8 0xBFEB143CD0247D02 //sin( 23) +data8 0xBFE10CF7D591F272 //cos( 23) +data8 0xBFECFA7F7919140F //sin( 24) +data8 0x3FDB25BFB50A609A //cos( 24) +data8 0xBFC0F0E6F31E809D //sin( 25) +data8 0x3FEFB7EEF59504FF //cos( 25) +data8 0x3FE866E0FAC32583 //sin( 26) +data8 0x3FE4B3902691A9ED //cos( 26) +data8 0x3FEE9AA1B0E5BA30 //sin( 27) +data8 0xBFD2B266F959DED5 //cos( 27) +data8 0x3FD156853B4514D6 //sin( 28) +data8 0xBFEECDAAD1582500 //cos( 28) +data8 0xBFE53C7D20A6C9E7 //sin( 29) +data8 0xBFE7F01658314E47 //cos( 29) +data8 0xBFEF9DF47F1C903D //sin( 30) +data8 0x3FC3BE82F2505A52 //cos( 30) +data8 0xBFD9DBC0B640FC81 //sin( 31) +data8 0x3FED4591C3E12A20 //cos( 31) +data8 0x3FE1A54991426566 //sin( 32) +data8 0x3FEAB1F5305DE8E5 //cos( 32) +ASM_SIZE_DIRECTIVE(sin_coeff_1_table) + +////////////////////////////////////////// + + +.global sinf +.global cosf +#ifdef _LIBC +.global __sinf +.global __cosf +#endif + +.text +.proc cosf +#ifdef _LIBC +.proc __cosf +#endif +.align 32 + + +cosf: // Single-precision cosine entry: x arrives in f8, result is returned in f8 +#ifdef _LIBC +__cosf: +#endif +{ .mfi + alloc r32 = ar.pfs,1,7,0,0 + fcvt.fx.s1 sin_Mx = f8 // M = x converted to an integer + cmp.ne p6,p7 = r0,r0 // p7 set if cos +} +{ .mfi + addl SIN_AD_PQ_1 = @ltoff(sin_coeff_1_table),gp + fnorm.s0 SIN_NORM_f8 = f8 // Sets denormal or invalid + mov sin_GR_sincos_flag = 0x0 // Flag 0 = cos; selects COS_DOUBLE when M is outside the table +} +;; + +{ .mfi + ld8 SIN_AD_PQ_1 = [SIN_AD_PQ_1] // Fetch the address of sin_coeff_1_table + fclass.m.unc p9,p0 = f8, 0x07 // p9 if x is zero (cos(0) early exit) + cmp.ne p8,p0 = r0,r0 // Clear p8: the sin(0) exit does not apply to cos +} +{ .mfb + nop.m 999 + nop.f 999 + br.sptk L(SINCOSF_COMMON) // Shared body is placed inside .proc sinf +} +;; + +.endp cosf +ASM_SIZE_DIRECTIVE(cosf) + + +.text +.proc sinf +#ifdef _LIBC +.proc __sinf +#endif +.align 32 + +sinf: // Single-precision sine entry: x arrives in f8, result is returned in f8 +#ifdef _LIBC +__sinf: +#endif +{ .mfi + alloc r32 = ar.pfs,1,7,0,0 + fcvt.fx.s1 sin_Mx = f8 // M = x converted to an integer + cmp.eq p6,p7 = r0,r0 // p6 set if sin +} +{ .mfi + addl SIN_AD_PQ_1 
= @ltoff(sin_coeff_1_table),gp + fnorm.s0 SIN_NORM_f8 = f8 // Sets denormal or invalid + mov sin_GR_sincos_flag = 0x1 // Flag 1 = sin; selects SIN_DOUBLE when M is outside the table +} +;; + +{ .mfi + ld8 SIN_AD_PQ_1 = [SIN_AD_PQ_1] // Fetch the address of sin_coeff_1_table + fclass.m.unc p8,p0 = f8, 0x07 // p8 if x is zero (sin(0) early exit) + cmp.ne p9,p0 = r0,r0 // Clear p9: the cos(0) exit does not apply to sin +} +{ .mfb + nop.m 999 + nop.f 999 + br.sptk L(SINCOSF_COMMON) +} +;; + + +L(SINCOSF_COMMON): + +// Here with p6 if sin, p7 if cos, p8 if sin(0), p9 if cos(0) + + +{ .mmf + ldfpd sin_coeff_Q3, sin_coeff_Q4 = [SIN_AD_PQ_1], 16 // Each coefficient load post-increments the table pointer by 16 + nop.m 999 + fclass.m.unc p11,p0 = f8, 0x23 // Test for x=inf +} +;; + +{ .mfb + ldfpd sin_coeff_Q5, sin_coeff_Q6 = [SIN_AD_PQ_1], 16 + fclass.m.unc p10,p0 = f8, 0xc3 // Test for x=nan +(p8) br.ret.spnt b0 // Exit for sin(0) +} +{ .mfb + nop.m 999 +(p9) fma.s f8 = f1,f1,f0 // cos(0) = 1.0 +(p9) br.ret.spnt b0 // Exit for cos(0) +} +;; + +{ .mmf + ldfpd sin_coeff_P4, sin_coeff_P5 = [SIN_AD_PQ_1], 16 + addl gr_tmp = -1,r0 // All-ones pattern, moved to fp_tmp below + fcvt.xf sin_Mfloat = sin_Mx // M as a floating-point value +} +;; + +{ .mfi + getf.sig sin_GR_Mint = sin_Mx // M as an integer in a general register +(p11) frcpa.s0 f8,p13 = f0,f0 // qnan indef if x=inf + nop.i 999 +} +{ .mfb + ldfpd sin_coeff_P1, sin_coeff_P2 = [SIN_AD_PQ_1], 16 + nop.f 999 +(p11) br.ret.spnt b0 // Exit for x=inf +} +;; + +{ .mfi + ldfpd sin_coeff_Q1, sin_coeff_Q2 = [SIN_AD_PQ_1], 16 + nop.f 999 + cmp.ge p8,p9 = -33,sin_GR_Mint // p8 if M <= -33, p9 otherwise +} +{ .mfb + add sin_GR_index = 32,sin_GR_Mint // Bias M so that M = -32 becomes index 0 +(p10) fma.s f8 = f8,f1,f0 // Force qnan if x=nan +(p10) br.ret.spnt b0 // Exit for x=nan +} +;; + +{ .mmi + ldfd sin_coeff_P3 = [SIN_AD_PQ_1], 16 +(p9) cmp.le p8,p0 = 33, sin_GR_Mint // p8 also if M >= 33: no sin(M)/cos(M) table entry + shl sin_GR_index = sin_GR_index,4 // 16 bytes (one sin, one cos) per table entry +} +;; + + +{ .mfi + setf.sig fp_tmp = gr_tmp // Create constant such that fmpy sets inexact + fnma.s1 sin_r = f1,sin_Mfloat,SIN_NORM_f8 // r = x - M +(p8) cmp.eq.unc p11,p12=sin_GR_sincos_flag,r0 // p11 if must call dbl cos + // p12 if must call dbl sin +} +{ .mbb + add SIN_AD_PQ_2 = sin_GR_index,SIN_AD_PQ_1 // Address of the table pair for M +(p11) br.cond.spnt COS_DOUBLE +(p12) br.cond.spnt SIN_DOUBLE +} +;; + +.pred.rel "mutex",p6,p7 //SIN_Sin_Flag, SIN_Cos_Flag +{ .mmi +(p6) ldfpd sin_tbl_S,sin_tbl_C = [SIN_AD_PQ_2] // sin: S = sin(M), C = cos(M); the cos load below swaps the pair +(p7) ldfpd 
sin_tbl_C,sin_tbl_S = [SIN_AD_PQ_2] + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p6) fclass.m.unc p8,p0 = f8, 0x0b // If sin, note denormal input to set uflow + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 sin_t = sin_r,sin_r,f0 // t = r^2 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 sin_rcube = sin_t,sin_r,f0 // r^3 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 sin_tsq = sin_t,sin_t,f0 // t^2 = r^4 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 sin_poly_q3 = sin_t,sin_coeff_Q4,sin_coeff_Q3 // Q4*t+Q3 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 sin_poly_q5 = sin_t,sin_coeff_Q6,sin_coeff_Q5 // Q6*t+Q5 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 sin_poly_p1 = sin_t,sin_coeff_P5,sin_coeff_P4 // P5*t+P4 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 sin_poly_p2 = sin_t,sin_coeff_P2,sin_coeff_P1 // P2*t+P1 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 sin_poly_q1 = sin_t,sin_coeff_Q2,sin_coeff_Q1 // Q2*t+Q1 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 sin_S_t = sin_t,sin_tbl_S,f0 // S*t + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p8) fmpy.s.s0 fp_tmp2 = f8,f8 // Dummy mult to set underflow if sin(denormal) + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 sin_r7 = sin_rcube,sin_tsq,f0 // r^7 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 sin_poly_q3456 = sin_tsq,sin_poly_q5,sin_poly_q3 // t^2*(Q6*t+Q5)+(Q4*t+Q3) + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 sin_poly_p3 = sin_t,sin_poly_p1,sin_coeff_P3 // t*(P5*t+P4)+P3 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 sin_poly_p4 = sin_rcube,sin_poly_p2,sin_r // r^3*(P2*t+P1)+r + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.s1 sin_tbl_S_tcube = sin_S_t,sin_tsq,f0 // S*t^3 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 sin_poly_q12 = sin_S_t,sin_poly_q1,sin_tbl_S // S*t*(Q2*t+Q1)+S + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.d.s1 sin_of_r = sin_r7,sin_poly_p3,sin_poly_p4 // sin(r) approximation: r^7*poly_p3+poly_p4 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 + fma.d.s1 sin_tbl_S_cos_of_r = sin_tbl_S_tcube,sin_poly_q3456,sin_poly_q12 // S*cos(r) approximation: S*t^3*poly_q3456+poly_q12 + nop.i 999 +} +{ .mfi + nop.m 999 + fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 +} +;; + + +.pred.rel "mutex",p6,p7 //SIN_Sin_Flag, SIN_Cos_Flag +{ .mfi + nop.m 999 +//(SIN_Sin_Flag) fma.s f8 = 
sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r +(p6) fma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r + nop.i 999 +} +{ .mfb + nop.m 999 +//(SIN_Cos_Flag) fnma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r +(p7) fnma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r // cos: the table pair was loaded swapped, giving cos(M)*cos(r) - sin(M)*sin(r); the p6 form above adds the two products for sin + br.ret.sptk b0 +} + +.endp sinf +ASM_SIZE_DIRECTIVE(sinf) + + +.proc SIN_DOUBLE +SIN_DOUBLE: // Fallback reached by branch from SINCOSF_COMMON when M is outside -32..32: call double-precision sin, then round to single +.prologue +{ .mfi + nop.m 0 + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Keep ar.pfs across the call +} +;; + +{ .mfi + mov GR_SAVE_GP=gp // Keep gp across the call + nop.f 0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Keep the return address; br.call overwrites b0 +} + +.body +{ .mmb + nop.m 999 + nop.m 999 + br.call.sptk.many b0=sin // f8 still holds x; the result comes back in f8 +} +;; + +{ .mfi + mov gp = GR_SAVE_GP + nop.f 999 + mov b0 = GR_SAVE_B0 +} +;; + +{ .mfi + nop.m 999 + fma.s f8 = f8,f1,f0 // Round the double result to single precision +(p0) mov ar.pfs = GR_SAVE_PFS +} +{ .mib + nop.m 999 + nop.i 999 +(p0) br.ret.sptk b0 +} +;; + +.endp SIN_DOUBLE +ASM_SIZE_DIRECTIVE(SIN_DOUBLE) + + +.proc COS_DOUBLE +COS_DOUBLE: // Fallback reached by branch from SINCOSF_COMMON when M is outside -32..32: call double-precision cos, then round to single +.prologue +{ .mfi + nop.m 0 + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Keep ar.pfs across the call +} +;; + +{ .mfi + mov GR_SAVE_GP=gp // Keep gp across the call + nop.f 0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Keep the return address; br.call overwrites b0 +} + +.body +{ .mmb + nop.m 999 + nop.m 999 + br.call.sptk.many b0=cos // f8 still holds x; the result comes back in f8 +} +;; + +{ .mfi + mov gp = GR_SAVE_GP + nop.f 999 + mov b0 = GR_SAVE_B0 +} +;; + +{ .mfi + nop.m 999 + fma.s f8 = f8,f1,f0 // Round the double result to single precision +(p0) mov ar.pfs = GR_SAVE_PFS +} +{ .mib + nop.m 999 + nop.i 999 +(p0) br.ret.sptk b0 +} +;; + +.endp COS_DOUBLE +ASM_SIZE_DIRECTIVE(COS_DOUBLE) + + + +.type sin,@function +.global sin +.type cos,@function +.global cos diff --git a/sysdeps/ia64/fpu/s_cosl.S b/sysdeps/ia64/fpu/s_cosl.S new file mode 100644 index 0000000000..a14ef5bc8f --- /dev/null +++ b/sysdeps/ia64/fpu/s_cosl.S @@ -0,0 +1,2506 @@ +.file "sincosl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// ********************************************************************* +// +// History: +// 2/02/2000 (hand-optimized) +// 4/04/00 Unwind support added +// +// ********************************************************************* +// +// Function: Combined sinl(x) and cosl(x), where +// +// sinl(x) = sine(x), for double-extended precision x values +// cosl(x) = cosine(x), for double-extended precision x values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f32-f99 +// +// General Purpose Registers: +// r32-r43 +// r44-r45 (Used to pass arguments to pi_by_2 reduce routine) +// +// Predicate Registers: p6-p13 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions do not occur +// Underflow exceptions raised when appropriate for sin +// (No specialized error handling for this routine) +// Inexact raised when appropriate by algorithm +// +// sinl(SNaN) = QNaN +// sinl(QNaN) = QNaN +// sinl(inf) = QNaN +// sinl(+/-0) = +/-0 +// cosl(inf) = QNaN +// cosl(SNaN) = QNaN +// cosl(QNaN) = QNaN +// cosl(0) = 1 +// +// ********************************************************************* +// +// Mathematical Description +// ======================== +// +// The computation of FSIN and FCOS is best handled in one piece of +// code. The main reason is that given any argument Arg, computation +// of trigonometric functions first calculate N and an approximation +// to alpha where +// +// Arg = N pi/2 + alpha, |alpha| <= pi/4. +// +// Since +// +// cosl( Arg ) = sinl( (N+1) pi/2 + alpha ), +// +// therefore, the code for computing sine will produce cosine as long +// as 1 is added to N immediately after the argument reduction +// process. +// +// Let M = N if sine +// N+1 if cosine. 
+// +// Now, given +// +// Arg = M pi/2 + alpha, |alpha| <= pi/4, +// +// let I = M mod 4, or I be the two lsb of M when M is represented +// as 2's complement. I = [i_0 i_1]. Then +// +// sinl( Arg ) = (-1)^i_0 sinl( alpha ) if i_1 = 0, +// = (-1)^i_0 cosl( alpha ) if i_1 = 1. +// +// For example: +// if M = -1, I = 11 +// sin (-pi/2 + alpha) = (-1) cos (alpha) +// if M = 0, I = 00 +// sin (alpha) = sin (alpha) +// if M = 1, I = 01 +// sin (pi/2 + alpha) = cos (alpha) +// if M = 2, I = 10 +// sin (pi + alpha) = (-1) sin (alpha) +// if M = 3, I = 11 +// sin ((3/2)pi + alpha) = (-1) cos (alpha) +// +// The value of alpha is obtained by argument reduction and +// represented by two working precision numbers r and c where +// +// alpha = r + c accurately. +// +// The reduction method is described in a previous write up. +// The argument reduction scheme identifies 4 cases. For Cases 2 +// and 4, because |alpha| is small, sinl(r+c) and cosl(r+c) can be +// computed very easily by 2 or 3 terms of the Taylor series +// expansion as follows: +// +// Case 2: +// ------- +// +// sinl(r + c) = r + c - r^3/6 accurately +// cosl(r + c) = 1 - 2^(-67) accurately +// +// Case 4: +// ------- +// +// sinl(r + c) = r + c - r^3/6 + r^5/120 accurately +// cosl(r + c) = 1 - r^2/2 + r^4/24 accurately +// +// The only cases left are Cases 1 and 3 of the argument reduction +// procedure. These two cases will be merged since after the +// argument is reduced in either case, we have the reduced argument +// represented as r + c and that the magnitude |r + c| is not small +// enough to allow the usage of a very short approximation. +// +// The required calculation is either +// +// sinl(r + c) = sinl(r) + correction, or +// cosl(r + c) = cosl(r) + correction. +// +// Specifically, +// +// sinl(r + c) = sinl(r) + c sin'(r) + O(c^2) +// = sinl(r) + c cos (r) + O(c^2) +// = sinl(r) + c(1 - r^2/2) accurately. 
+// Similarly, +// +// cosl(r + c) = cosl(r) - c sinl(r) + O(c^2) +// = cosl(r) - c(r - r^3/6) accurately. +// +// We therefore concentrate on accurately calculating sinl(r) and +// cosl(r) for a working-precision number r, |r| <= pi/4 to within +// 0.1% or so. +// +// The greatest challenge of this task is that the second terms of +// the Taylor series +// +// r - r^3/3! + r^5/5! - ... +// +// and +// +// 1 - r^2/2! + r^4/4! - ... +// +// are not very small when |r| is close to pi/4 and the rounding +// errors will be a concern if simple polynomial accumulation is +// used. When |r| < 2^-3, however, the second terms will be small +// enough (6 bits or so of right shift) that a normal Horner +// recurrence suffices. Hence there are two cases that we consider +// in the accurate computation of sinl(r) and cosl(r), |r| <= pi/4. +// +// Case small_r: |r| < 2^(-3) +// -------------------------- +// +// Since Arg = M pi/2 + r + c accurately, and M mod 4 is [i_0 i_1], +// we have +// +// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0 +// = (-1)^i_0 * cosl(r + c) if i_1 = 1 +// +// can be accurately approximated by +// +// sinl(Arg) = (-1)^i_0 * [sinl(r) + c] if i_1 = 0 +// = (-1)^i_0 * [cosl(r) - c*r] if i_1 = 1 +// +// because |r| is small and thus the second terms in the correction +// are unnecessary. +// +// Finally, sinl(r) and cosl(r) are approximated by polynomials of +// moderate lengths. +// +// sinl(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11 +// cosl(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10 +// +// We can make use of predicates to selectively calculate +// sinl(r) or cosl(r) based on i_1. +// +// Case normal_r: 2^(-3) <= |r| <= pi/4 +// ------------------------------------ +// +// This case is more likely than the previous one if one considers +// r to be uniformly distributed in [-pi/4 pi/4]. Again, +// +// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0 +// = (-1)^i_0 * cosl(r + c) if i_1 = 1. 
+// +// Because |r| is now larger, we need one extra term in the +// correction. sinl(Arg) can be accurately approximated by +// +// sinl(Arg) = (-1)^i_0 * [sinl(r) + c(1-r^2/2)] if i_1 = 0 +// = (-1)^i_0 * [cosl(r) - c*r*(1 - r^2/6)] i_1 = 1. +// +// Finally, sinl(r) and cosl(r) are approximated by polynomials of +// moderate lengths. +// +// sinl(r) = r + PP_1_hi r^3 + PP_1_lo r^3 + +// PP_2 r^5 + ... + PP_8 r^17 +// +// cosl(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16 +// +// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2. +// The crux in accurate computation is to calculate +// +// r + PP_1_hi r^3 or 1 + QQ_1 r^2 +// +// accurately as two pieces: U_hi and U_lo. The way to achieve this +// is to obtain r_hi as a 10 sig. bit number that approximates r to +// roughly 8 bits or so of accuracy. (One convenient way is +// +// r_hi := frcpa( frcpa( r ) ).) +// +// This way, +// +// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 + +// PP_1_hi (r^3 - r_hi^3) +// = [r + PP_1_hi r_hi^3] + +// [PP_1_hi (r - r_hi) +// (r^2 + r_hi r + r_hi^2) ] +// = U_hi + U_lo +// +// Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long, +// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed +// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign +// and that there is no more than 8 bit shift off between r and +// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus +// calculated without any error. Finally, the fact that +// +// |U_lo| <= 2^(-8) |U_hi| +// +// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly +// 8 extra bits of accuracy. +// +// Similarly, +// +// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] + +// [QQ_1 (r - r_hi)(r + r_hi)] +// = U_hi + U_lo. +// +// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ). +// +// If i_1 = 0, then +// +// U_hi := r + PP_1_hi * r_hi^3 +// U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2) +// poly := PP_1_lo r^3 + PP_2 r^5 + ... 
+ PP_8 r^17 +// correction := c * ( 1 + C_1 r^2 ) +// +// Else ...i_1 = 1 +// +// U_hi := 1 + QQ_1 * r_hi * r_hi +// U_lo := QQ_1 * (r - r_hi) * (r + r_hi) +// poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16 +// correction := -c * r * (1 + S_1 * r^2) +// +// End +// +// Finally, +// +// V := poly + ( U_lo + correction ) +// +// / U_hi + V if i_0 = 0 +// result := | +// \ (-U_hi) - V if i_0 = 1 +// +// It is important that in the last step, negation of U_hi is +// performed prior to the subtraction which is to be performed in +// the user-set rounding mode. +// +// +// Algorithmic Description +// ======================= +// +// The argument reduction algorithm is tightly integrated into FSIN +// and FCOS which share the same code. The following is complete and +// self-contained. The argument reduction description given +// previously is repeated below. +// +// +// Step 0. Initialization. +// +// If FSIN is invoked, set N_inc := 0; else if FCOS is invoked, +// set N_inc := 1. +// +// Step 1. Check for exceptional and special cases. +// +// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special +// handling. +// * If |Arg| < 2^24, go to Step 2 for reduction of moderate +// arguments. This is the most likely case. +// * If |Arg| < 2^63, go to Step 8 for pre-reduction of large +// arguments. +// * If |Arg| >= 2^63, go to Step 10 for special handling. +// +// Step 2. Reduction of moderate arguments. +// +// If |Arg| < pi/4 ...quick branch +// N_fix := N_inc (integer) +// r := Arg +// c := 0.0 +// Branch to Step 4, Case_1_complete +// Else ...cf. argument reduction +// N := Arg * two_by_PI (fp) +// N_fix := fcvt.fx( N ) (int) +// N := fcvt.xf( N_fix ) +// N_fix := N_fix + N_inc +// s := Arg - N * P_1 (first piece of pi/2) +// w := -N * P_2 (second piece of pi/2) +// +// If |s| >= 2^(-33) +// go to Step 3, Case_1_reduce +// Else +// go to Step 7, Case_2_reduce +// Endif +// Endif +// +// Step 3. Case_1_reduce. 
+// +// r := s + w +// c := (s - r) + w ...observe order +// +// Step 4. Case_1_complete +// +// ...At this point, the reduced argument alpha is +// ...accurately represented as r + c. +// If |r| < 2^(-3), go to Step 6, small_r. +// +// Step 5. Normal_r. +// +// Let [i_0 i_1] be the 2 lsb of N_fix. +// FR_rsq := r * r +// r_hi := frcpa( frcpa( r ) ) +// r_lo := r - r_hi +// +// If i_1 = 0, then +// poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... + FR_rsq*PP_8)) +// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order +// U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi) +// correction := c + c*C_1*FR_rsq ...any order +// Else +// poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8)) +// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order +// U_lo := QQ_1 * r_lo * (r + r_hi) +// correction := -c*(r + S_1*FR_rsq*r) ...any order +// Endif +// +// V := poly + (U_lo + correction) ...observe order +// +// result := (i_0 == 0? 1.0 : -1.0) +// +// Last instruction in user-set rounding mode +// +// result := (i_0 == 0? result*U_hi + V : +// result*U_hi - V) +// +// Return +// +// Step 6. Small_r. +// +// ...Use flush to zero mode without causing exception +// Let [i_0 i_1] be the two lsb of N_fix. +// +// FR_rsq := r * r +// +// If i_1 = 0 then +// z := FR_rsq*FR_rsq; z := FR_rsq*z *r +// poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5) +// poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2) +// correction := c +// result := r +// Else +// z := FR_rsq*FR_rsq; z := FR_rsq*z +// poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5) +// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2) +// correction := -c*r +// result := 1 +// Endif +// +// poly := poly_hi + (z * poly_lo + correction) +// +// If i_0 = 1, result := -result +// +// Last operation. Perform in user-set rounding mode +// +// result := (i_0 == 0? result + poly : +// result - poly ) +// Return +// +// Step 7. Case_2_reduce. +// +// ...Refer to the write up for argument reduction for +// ...rationale. 
The reduction algorithm below is taken from +// ...argument reduction description and integrated this. +// +// w := N*P_3 +// U_1 := N*P_2 + w ...FMA +// U_2 := (N*P_2 - U_1) + w ...2 FMA +// ...U_1 + U_2 is N*(P_2+P_3) accurately +// +// r := s - U_1 +// c := ( (s - r) - U_1 ) - U_2 +// +// ...The mathematical sum r + c approximates the reduced +// ...argument accurately. Note that although compared to +// ...Case 1, this case requires much more work to reduce +// ...the argument, the subsequent calculation needed for +// ...any of the trigonometric function is very little because +// ...|alpha| < 1.01*2^(-33) and thus two terms of the +// ...Taylor series expansion suffices. +// +// If i_1 = 0 then +// poly := c + S_1 * r * r * r ...any order +// result := r +// Else +// poly := -2^(-67) +// result := 1.0 +// Endif +// +// If i_0 = 1, result := -result +// +// Last operation. Perform in user-set rounding mode +// +// result := (i_0 == 0? result + poly : +// result - poly ) +// +// Return +// +// +// Step 8. Pre-reduction of large arguments. +// +// ...Again, the following reduction procedure was described +// ...in the separate write up for argument reduction, which +// ...is tightly integrated here. + +// N_0 := Arg * Inv_P_0 +// N_0_fix := fcvt.fx( N_0 ) +// N_0 := fcvt.xf( N_0_fix) + +// Arg' := Arg - N_0 * P_0 +// w := N_0 * d_1 +// N := Arg' * two_by_PI +// N_fix := fcvt.fx( N ) +// N := fcvt.xf( N_fix ) +// N_fix := N_fix + N_inc +// +// s := Arg' - N * P_1 +// w := w - N * P_2 +// +// If |s| >= 2^(-14) +// go to Step 3 +// Else +// go to Step 9 +// Endif +// +// Step 9. Case_4_reduce. 
+// +// ...first obtain N_0*d_1 and -N*P_2 accurately +// U_hi := N_0 * d_1 V_hi := -N*P_2 +// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - V_hi ...FMAs +// +// ...compute the contribution from N_0*d_2 and -N*P_3 +// w := -N*P_3 +// w := w + N_0*d_2 +// t := U_lo + V_lo + w ...any order +// +// ...at this point, the mathematical value +// ...s + U_hi + V_hi + t approximates the true reduced argument +// ...accurately. Just need to compute this accurately. +// +// ...Calculate U_hi + V_hi accurately: +// A := U_hi + V_hi +// if |U_hi| >= |V_hi| then +// a := (U_hi - A) + V_hi +// else +// a := (V_hi - A) + U_hi +// endif +// ...order in computing "a" must be observed. This branch is +// ...best implemented by predicates. +// ...A + a is U_hi + V_hi accurately. Moreover, "a" is +// ...much smaller than A: |a| <= (1/2)ulp(A). +// +// ...Just need to calculate s + A + a + t +// C_hi := s + A t := t + a +// C_lo := (s - C_hi) + A +// C_lo := C_lo + t +// +// ...Final steps for reduction +// r := C_hi + C_lo +// c := (C_hi - r) + C_lo +// +// ...At this point, we have r and c +// ...And all we need is a couple of terms of the corresponding +// ...Taylor series. +// +// If i_1 = 0 +// poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2) +// result := r +// Else +// poly := FR_rsq*(C_1 + FR_rsq*C_2) +// result := 1 +// Endif +// +// If i_0 = 1, result := -result +// +// Last operation. Perform in user-set rounding mode +// +// result := (i_0 == 0? result + poly : +// result - poly ) +// Return +// +// Large Arguments: For arguments above 2**63, a Payne-Hanek +// style argument reduction is used and pi_by_2 reduce is called. 
+// + +#include "libm_support.h" + +#ifdef _LIBC +.rodata +#else +.data +#endif +.align 64 + +FSINCOSL_CONSTANTS: +ASM_TYPE_DIRECTIVE(FSINCOSL_CONSTANTS,@object) +data4 0x4B800000, 0xCB800000, 0x00000000,0x00000000 // two**24, -two**24 +data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2 +data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0 +data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1 +data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2 +data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3 +data4 0x5F000000, 0xDF000000, 0x00000000,0x00000000 // two_to_63, -two_to_63 +data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0 +data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1 +data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2 +data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4 +data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE,0x00000000 // neg_pi_by_4 +data4 0x3E000000, 0xBE000000, 0x00000000,0x00000000 // two**-3, -two**-3 +data4 0x2F000000, 0xAF000000, 0x9E000000,0x00000000 // two**-33, -two**-33, -two**-67 +data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8 +data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7 +data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6 +data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5 +data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1 +data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi +data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4 +data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3 +data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2 +data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lo +data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2,0x00000000 // QQ_8 +data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA,0x00000000 // QQ_7 +data4 0x9C716658, 0x8F76C650, 0x00003FE2,0x00000000 // QQ_6 +data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9,0x00000000 // QQ_5 +data4 0xAAAAAAAA, 0xAAAAAAAA, 
0x0000BFFC,0x00000000 // S_1 +data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1 +data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4 +data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3 +data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2 +data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1 +data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2 +data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3 +data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4 +data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5 +data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1 +data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2 +data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3 +data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4 +data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5 +data4 0x38800000, 0xB8800000, 0x00000000 // two**-14, -two**-14 +ASM_SIZE_DIRECTIVE(FSINCOSL_CONSTANTS) + +FR_Input_X = f8 +FR_Neg_Two_to_M3 = f32 +FR_Two_to_63 = f32 +FR_Two_to_24 = f33 +FR_Pi_by_4 = f33 +FR_Two_to_M14 = f34 +FR_Two_to_M33 = f35 +FR_Neg_Two_to_24 = f36 +FR_Neg_Pi_by_4 = f36 +FR_Neg_Two_to_M14 = f37 +FR_Neg_Two_to_M33 = f38 +FR_Neg_Two_to_M67 = f39 +FR_Inv_pi_by_2 = f40 +FR_N_float = f41 +FR_N_fix = f42 +FR_P_1 = f43 +FR_P_2 = f44 +FR_P_3 = f45 +FR_s = f46 +FR_w = f47 +FR_c = f48 +FR_r = f49 +FR_Z = f50 +FR_A = f51 +FR_a = f52 +FR_t = f53 +FR_U_1 = f54 +FR_U_2 = f55 +FR_C_1 = f56 +FR_C_2 = f57 +FR_C_3 = f58 +FR_C_4 = f59 +FR_C_5 = f60 +FR_S_1 = f61 +FR_S_2 = f62 +FR_S_3 = f63 +FR_S_4 = f64 +FR_S_5 = f65 +FR_poly_hi = f66 +FR_poly_lo = f67 +FR_r_hi = f68 +FR_r_lo = f69 +FR_rsq = f70 +FR_r_cubed = f71 +FR_C_hi = f72 +FR_N_0 = f73 +FR_d_1 = f74 +FR_V = f75 +FR_V_hi = f75 +FR_V_lo = f76 +FR_U_hi = f77 +FR_U_lo = f78 +FR_U_hiabs = f79 +FR_V_hiabs = f80 +FR_PP_8 = f81 +FR_QQ_8 = f81 +FR_PP_7 = f82 +FR_QQ_7 = f82 +FR_PP_6 = f83 +FR_QQ_6 = f83 +FR_PP_5 = f84 +FR_QQ_5 = f84 +FR_PP_4 = f85 +FR_QQ_4 = 
f85 +FR_PP_3 = f86 +FR_QQ_3 = f86 +FR_PP_2 = f87 +FR_QQ_2 = f87 +FR_QQ_1 = f88 +FR_N_0_fix = f89 +FR_Inv_P_0 = f90 +FR_corr = f91 +FR_poly = f92 +FR_d_2 = f93 +FR_Two_to_M3 = f94 +FR_Neg_Two_to_63 = f94 +FR_P_0 = f95 +FR_C_lo = f96 +FR_PP_1 = f97 +FR_PP_1_lo = f98 +FR_ArgPrime = f99 + +GR_Table_Base = r32 +GR_Table_Base1 = r33 +GR_i_0 = r34 +GR_i_1 = r35 +GR_N_Inc = r36 +GR_Sin_or_Cos = r37 + +// Added for unwind support + +GR_SAVE_B0 = r39 +GR_SAVE_GP = r40 +GR_SAVE_PFS = r41 + + +.global sinl# +.global cosl# +#ifdef _LIBC +.global __sinl# +.global __cosl# +#endif + +.section .text +.proc sinl# +#ifdef _LIBC +.proc __sinl# +#endif +.align 64 +sinl: +#ifdef _LIBC +__sinl: +#endif +{ .mlx +alloc GR_Table_Base = ar.pfs,0,12,2,0 +(p0) movl GR_Sin_or_Cos = 0x0 ;; +} + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmb + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 +(p0) br.cond.sptk L(SINCOSL_CONTINUE) ;; +} +;; + + +.endp sinl# +ASM_SIZE_DIRECTIVE(sinl#) + +.section .text +.proc cosl# +cosl: +#ifdef _LIBC +.proc __cosl# +__cosl: +#endif +{ .mlx +alloc GR_Table_Base= ar.pfs,0,12,2,0 +(p0) movl GR_Sin_or_Cos = 0x1 ;; +} +;; + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmb + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.b 999 +} +;; + + + +// +// Load Table Address +// + +L(SINCOSL_CONTINUE): +{ .mmi +(p0) add GR_Table_Base1 = 96, GR_Table_Base +(p0) ldfs FR_Two_to_24 = [GR_Table_Base], 4 +// GR_Sin_or_Cos denotes +(p0) mov r39 = b0 ;; +} +{ .mmi + nop.m 0 +// +// Load 2**24, load 2**63. +// +(p0) ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12 + nop.i 0 +} +{ .mfi +(p0) ldfs FR_Two_to_63 = [GR_Table_Base1], 4 +// +// Check for unnormals - unsupported operands. 
We do not want +// to generate denormal exception +// Check for NatVals, QNaNs, SNaNs, +/-Infs +// Check for EM unsupporteds +// Check for Zero +// +(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 + nop.i 0 +};; +{ .mmf + nop.m 999 +(p0) ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12 +(p0) fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF +} +{ .mfb + nop.m 999 +(p0) fclass.m.unc p10, p0 = FR_Input_X, 0x007 +(p6) br.cond.spnt L(SINCOSL_SPECIAL) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p8) br.cond.spnt L(SINCOSL_SPECIAL) ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Branch if +/- NaN, Inf. +// Load -2**24, load -2**63. +// +(p10) br.cond.spnt L(SINCOSL_ZERO) ;; +} +{ .mmb +(p0) ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16 +(p0) ldfe FR_Inv_P_0 = [GR_Table_Base1], 16 + nop.b 999 ;; +} +{ .mmb +(p0) ldfe FR_d_1 = [GR_Table_Base1], 16 +// +// Raise possible denormal operand flag with useful fcmp +// Is x <= -2**63 +// Load Inv_P_0 for pre-reduction +// Load Inv_pi_by_2 +// +(p0) ldfe FR_P_0 = [GR_Table_Base], 16 + nop.b 999 ;; +} +{ .mmb +(p0) ldfe FR_d_2 = [GR_Table_Base1], 16 +// +// Load P_0 +// Load d_1 +// Is x >= 2**63 +// Is x <= -2**24? +// +(p0) ldfe FR_P_1 = [GR_Table_Base], 16 + nop.b 999 ;; +} +// +// Load P_1 +// Load d_2 +// Is x >= 2**24? +// +{ .mfi +(p0) ldfe FR_P_2 = [GR_Table_Base], 16 +(p0) fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24 + nop.i 999 ;; +} +{ .mbb +(p0) ldfe FR_P_3 = [GR_Table_Base], 16 + nop.b 999 + nop.b 999 ;; +} +{ .mfi + nop.m 999 +(p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24 + nop.i 999 +} +{ .mfi +(p0) ldfe FR_Pi_by_4 = [GR_Table_Base1], 16 +// +// Branch if +/- zero. 
+// Decide about the paths to take: +// If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2 +// OTHERWISE - CASE 3 OR 4 +// +(p0) fcmp.le.unc.s0 p10, p11 = FR_Input_X, FR_Neg_Two_to_63 + nop.i 999 ;; +} +{ .mmi +(p0) ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;; +(p0) ldfs FR_Two_to_M3 = [GR_Table_Base1], 4 + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fcmp.ge.s1 p10, p0 = FR_Input_X, FR_Two_to_63 + nop.i 999 ;; +} +{ .mib +(p0) ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12 + nop.i 999 +// +// Load P_2 +// Load P_3 +// Load pi_by_4 +// Load neg_pi_by_4 +// Load 2**(-3) +// Load -2**(-3). +// +(p10) br.cond.spnt L(SINCOSL_ARG_TOO_LARGE) ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Branch out if x >= 2**63. Use Payne-Hanek Reduction +// +(p7) br.cond.spnt L(SINCOSL_LARGER_ARG) ;; +} +{ .mfi + nop.m 999 +// +// Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction. +// +(p0) fma.s1 FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p6, p7 = FR_Input_X, FR_Pi_by_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Select the case when |Arg| < pi/4 +// Else Select the case when |Arg| >= pi/4 +// +(p0) fcvt.fx.s1 FR_N_fix = FR_N_float + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N = Arg * 2/pi +// Check if Arg < pi/4 +// +(p6) fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4 + nop.i 999 ;; +} +// +// Case 2: Convert integer N_fix back to normalized floating-point value. +// Case 1: p8 is only affected when p6 is set +// +{ .mfi +(p7) ldfs FR_Two_to_M33 = [GR_Table_Base1], 4 +// +// Grab the integer part of N and call it N_fix +// +(p6) fmerge.se FR_r = FR_Input_X, FR_Input_X +// If |x| < pi/4, r = x and c = 0 +// lf |x| < pi/4, is x < 2**(-3). 
+// r = Arg +// c = 0 +(p6) mov GR_N_Inc = GR_Sin_or_Cos ;; +} +{ .mmf + nop.m 999 +(p7) ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4 +(p6) fmerge.se FR_c = f0, f0 +} +{ .mfi + nop.m 999 +(p6) fcmp.lt.unc.s1 p8, p9 = FR_Input_X, FR_Two_to_M3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// lf |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8. +// If |x| >= pi/4, +// Create the right N for |x| < pi/4 and otherwise +// Case 2: Place integer part of N in GP register +// +(p7) fcvt.xf FR_N_float = FR_N_fix + nop.i 999 ;; +} +{ .mmf + nop.m 999 +(p7) getf.sig GR_N_Inc = FR_N_fix +(p8) fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Load 2**(-33), -2**(-33) +// +(p8) br.cond.spnt L(SINCOSL_SMALL_R) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.sptk L(SINCOSL_NORMAL_R) ;; +} +// +// if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise. +// +// +// In this branch, |x| >= pi/4. +// +{ .mfi +(p0) ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8 +// +// Load -2**(-67) +// +(p0) fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X +// +// w = N * P_2 +// s = -N * P_1 + Arg +// +(p0) add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_w = FR_N_float, FR_P_2, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Adjust N_fix by N_inc to determine whether sine or +// cosine is being calculated +// +(p0) fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// Remember x >= pi/4. 
+// Is s <= -2**(-33) or s >= 2**(-33) (p6) +// or -2**(-33) < s < 2**(-33) (p7) +(p6) fms.s1 FR_r = FR_s, f1, FR_w + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w + nop.i 999 +} +{ .mfi + nop.m 999 +(p6) fms.s1 FR_c = FR_s, f1, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// For big s: r = s - w: No futher reduction is necessary +// For small s: w = N * P_3 (change sign) More reduction +// +(p6) fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p8) fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fms.s1 FR_r = FR_s, f1, FR_U_1 + nop.i 999 +} +{ .mfb + nop.m 999 +// +// For big s: Is |r| < 2**(-3)? +// For big s: c = S - r +// For small s: U_1 = N * P_2 + w +// +// If p8 is set, prepare to branch to Small_R. +// If p9 is set, prepare to branch to Normal_R. +// For big s, r is complete here. +// +(p6) fms.s1 FR_c = FR_c, f1, FR_w +// +// For big s: c = c + w (w has not been negated.) +// For small s: r = S - U_1 +// +(p8) br.cond.spnt L(SINCOSL_SMALL_R) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.sptk L(SINCOSL_NORMAL_R) ;; +} +{ .mfi +(p7) add GR_Table_Base1 = 224, GR_Table_Base1 +// +// Branch to SINCOSL_SMALL_R or SINCOSL_NORMAL_R +// +(p7) fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1 +// +// c = S - U_1 +// r = S_1 * r +// +// +(p7) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; +} +{ .mmi + nop.m 999 +// +// Get [i_0,i_1] - two lsb of N_fix_gr. +// Do dummy fmpy so inexact is always set. +// +(p7) cmp.eq.unc p9, p10 = 0x0, GR_i_1 +(p7) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; +} +// +// For small s: U_2 = N * P_2 - U_1 +// S_1 stored constant - grab the one stored with the +// coefficients. 
+// +{ .mfi +(p7) ldfe FR_S_1 = [GR_Table_Base1], 16 +// +// Check if i_1 and i_0 != 0 +// +(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67 +(p7) cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;; +} +{ .mfi + nop.m 999 +(p7) fms.s1 FR_s = FR_s, f1, FR_r + nop.i 999 +} +{ .mfi + nop.m 999 +// +// S = S - r +// U_2 = U_2 + w +// load S_1 +// +(p7) fma.s1 FR_rsq = FR_r, FR_r, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fma.s1 FR_U_2 = FR_U_2, f1, FR_w + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fmerge.se FR_Input_X = FR_r, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_Input_X = f0, f1, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// FR_rsq = r * r +// Save r as the result. +// +(p7) fms.s1 FR_c = FR_s, f1, FR_U_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if ( i_1 ==0) poly = c + S_1*r*r*r +// else Result = 1 +// +(p12) fnma.s1 FR_Input_X = FR_Input_X, f1, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fma.s1 FR_r = FR_S_1, FR_r, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fma.s0 FR_S_1 = FR_S_1, FR_S_1, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// If i_1 != 0, poly = 2**(-67) +// +(p7) fms.s1 FR_c = FR_c, f1, FR_U_2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// c = c - U_2 +// +(p9) fma.s1 FR_poly = FR_r, FR_rsq, FR_c + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// i_0 != 0, so Result = -Result +// +(p11) fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p12) fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly +// +// if (i_0 == 0), Result = Result + poly +// else Result = Result - poly +// +(p0) br.ret.sptk b0 ;; +} +L(SINCOSL_LARGER_ARG): +{ .mfi + nop.m 999 +(p0) fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0 + nop.i 999 +} +;; + +// This path for argument > 2*24 +// Adjust table_ptr1 to beginning of table. 
+// + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.i 999 +} +;; + + +// +// Point to 2*-14 +// N_0 = Arg * Inv_P_0 +// +{ .mmi +(p0) add GR_Table_Base = 688, GR_Table_Base ;; +(p0) ldfs FR_Two_to_M14 = [GR_Table_Base], 4 + nop.i 999 ;; +} +{ .mfi +(p0) ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Load values 2**(-14) and -2**(-14) +// +(p0) fcvt.fx.s1 FR_N_0_fix = FR_N_0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N_0_fix = integer part of N_0 +// +(p0) fcvt.xf FR_N_0 = FR_N_0_fix + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Make N_0 the integer part +// +(p0) fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_w = FR_N_0, FR_d_1, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Arg' = -N_0 * P_0 + Arg +// w = N_0 * d_1 +// +(p0) fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N = A' * 2/pi +// +(p0) fcvt.fx.s1 FR_N_fix = FR_N_float + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N_fix is the integer part +// +(p0) fcvt.xf FR_N_float = FR_N_fix + nop.i 999 ;; +} +{ .mfi +(p0) getf.sig GR_N_Inc = FR_N_fix + nop.f 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +(p0) add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;; +} +{ .mfi + nop.m 999 +// +// N is the integer part of the reduced-reduced argument. 
+// Put the integer in a GP register +// +(p0) fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// s = -N*P_1 + Arg' +// w = -N*P_2 + w +// N_fix_gr = N_fix_gr + N_inc +// +(p0) fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// For |s| > 2**(-14) r = S + w (r complete) +// Else U_hi = N_0 * d_1 +// +(p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Either S <= -2**(-14) or S >= 2**(-14) +// or -2**(-14) < s < 2**(-14) +// +(p8) fma.s1 FR_r = FR_s, f1, FR_w + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// We need abs of both U_hi and V_hi - don't +// worry about switched sign of V_hi. +// +(p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Big s: finish up c = (S - r) + w (c complete) +// Case 4: A = U_hi + V_hi +// Note: Worry about switched sign of V_hi, so subtract instead of add. +// +(p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi + nop.i 999 ;; +} +{ .mmf + nop.m 999 + nop.m 999 +(p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi +} +{ .mfi + nop.m 999 +(p9) fmerge.s FR_V_hiabs = f0, FR_V_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// For big s: c = S - r +// For small s do more work: U_lo = N_0 * d_1 - U_hi +// +(p9) fmerge.s FR_U_hiabs = f0, FR_U_hi + nop.i 999 +} +{ .mfi + nop.m 999 +// +// For big s: Is |r| < 2**(-3) +// For big s: if p12 set, prepare to branch to Small_R. +// For big s: If p13 set, prepare to branch to Normal_R. 
+// +(p8) fms.s1 FR_c = FR_s, f1, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// For small S: V_hi = N * P_2 +// w = N * P_3 +// Note the product does not include the (-) as in the writeup +// so (-) missing for V_hi and w. +// +(p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p8) fma.s1 FR_c = FR_c, f1, FR_w + nop.i 999 +} +{ .mfb + nop.m 999 +(p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w +(p12) br.cond.spnt L(SINCOSL_SMALL_R) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p13) br.cond.sptk L(SINCOSL_NORMAL_R) ;; +} +{ .mfi + nop.m 999 +// +// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true. +// The remaining stuff is for Case 4. +// Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup) +// Note: the (-) is still missing for V_lo. +// Small s: w = w + N_0 * d_2 +// Note: the (-) is now incorporated in w. +// +(p9) fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs +(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 +} +{ .mfi + nop.m 999 +// +// C_hi = S + A +// +(p9) fma.s1 FR_t = FR_U_lo, f1, FR_V_lo +(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; +} +{ .mfi + nop.m 999 +// +// t = U_lo + V_lo +// +// +(p10) fms.s1 FR_a = FR_U_hi, f1, FR_A + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fma.s1 FR_a = FR_V_hi, f1, FR_A + nop.i 999 +} +;; + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.i 999 +} +;; + + +{ .mfi +(p0) add GR_Table_Base = 528, GR_Table_Base +// +// Is U_hiabs >= V_hiabs? +// +(p9) fma.s1 FR_C_hi = FR_s, f1, FR_A + nop.i 999 ;; +} +{ .mmi +(p0) ldfe FR_C_1 = [GR_Table_Base], 16 ;; +(p0) ldfe FR_C_2 = [GR_Table_Base], 64 + nop.i 999 ;; +} +// +// c = c + C_lo finished. 
+// Load C_2 +// +{ .mfi +(p0) ldfe FR_S_1 = [GR_Table_Base], 16 +// +// C_lo = S - C_hi +// +(p0) fma.s1 FR_t = FR_t, f1, FR_w + nop.i 999 ;; +} +// +// r and c have been computed. +// Make sure ftz mode is set - should be automatic when using wre +// |r| < 2**(-3) +// Get [i_0,i_1] - two lsb of N_fix. +// Load S_1 +// +{ .mfi +(p0) ldfe FR_S_2 = [GR_Table_Base], 64 +// +// t = t + w +// +(p10) fms.s1 FR_a = FR_a, f1, FR_V_hi +(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_0 ;; +} +{ .mfi + nop.m 999 +// +// For larger u than v: a = U_hi - A +// Else a = V_hi - A (do an add to account for missing (-) on V_hi +// +(p0) fms.s1 FR_C_lo = FR_s, f1, FR_C_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fms.s1 FR_a = FR_U_hi, f1, FR_a +(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_1 ;; +} +{ .mfi + nop.m 999 +// +// If u > v: a = (U_hi - A) + V_hi +// Else a = (V_hi - A) + U_hi +// In each case account for negative missing from V_hi. +// +(p0) fma.s1 FR_C_lo = FR_C_lo, f1, FR_A + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// C_lo = (S - C_hi) + A +// +(p0) fma.s1 FR_t = FR_t, f1, FR_a + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// t = t + a +// +(p0) fma.s1 FR_C_lo = FR_C_lo, f1, FR_t + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// C_lo = C_lo + t +// Adjust Table_Base to beginning of table +// +(p0) fma.s1 FR_r = FR_C_hi, f1, FR_C_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Load S_2 +// +(p0) fma.s1 FR_rsq = FR_r, FR_r, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Table_Base points to C_1 +// r = C_hi + C_lo +// +(p0) fms.s1 FR_c = FR_C_hi, f1, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if i_1 ==0: poly = S_2 * FR_rsq + S_1 +// else poly = C_2 * FR_rsq + C_1 +// +(p11) fma.s1 FR_Input_X = f0, f1, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 FR_Input_X = f0, f1, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Compute r_cube = FR_rsq * r +// +(p11) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 FR_poly = FR_rsq, FR_C_2, 
FR_C_1 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Compute FR_rsq = r * r +// Is i_1 == 0 ? +// +(p0) fma.s1 FR_r_cubed = FR_rsq, FR_r, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// c = C_hi - r +// Load C_1 +// +(p0) fma.s1 FR_c = FR_c, f1, FR_C_lo + nop.i 999 +} +{ .mfi + nop.m 999 +// +// if i_1 ==0: poly = r_cube * poly + c +// else poly = FR_rsq * poly +// +(p10) fms.s1 FR_Input_X = f0, f1, FR_Input_X + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if i_1 ==0: Result = r +// else Result = 1.0 +// +(p11) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 FR_poly = FR_rsq, FR_poly, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if i_0 !=0: Result = -Result +// +(p9) fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p10) fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly +// +// if i_0 == 0: Result = Result + poly +// else Result = Result - poly +// +(p0) br.ret.sptk b0 ;; +} +L(SINCOSL_SMALL_R): +{ .mii + nop.m 999 +(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; +// +// +// Compare both i_1 and i_0 with 0. +// if i_1 == 0, set p9. +// if i_0 == 0, set p11. +// +(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_rsq = FR_r, FR_r, f0 +(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; +} +{ .mfi + nop.m 999 +// +// Z = Z * FR_rsq +// +(p10) fnma.s1 FR_c = FR_c, FR_r, f0 +(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0 +} +;; + +// ****************************************************************** +// ****************************************************************** +// ****************************************************************** +// r and c have been computed. +// We know whether this is the sine or cosine routine. +// Make sure ftz mode is set - should be automatic when using wre +// |r| < 2**(-3) +// +// Set table_ptr1 to beginning of constant table. +// Get [i_0,i_1] - two lsb of N_fix_gr. 
+// + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.i 999 +} +;; + + +// +// Set table_ptr1 to point to S_5. +// Set table_ptr1 to point to C_5. +// Compute FR_rsq = r * r +// +{ .mfi +(p9) add GR_Table_Base = 672, GR_Table_Base +(p10) fmerge.s FR_r = f1, f1 +(p10) add GR_Table_Base = 592, GR_Table_Base ;; +} +// +// Set table_ptr1 to point to S_5. +// Set table_ptr1 to point to C_5. +// +{ .mmi +(p9) ldfe FR_S_5 = [GR_Table_Base], -16 ;; +// +// if (i_1 == 0) load S_5 +// if (i_1 != 0) load C_5 +// +(p9) ldfe FR_S_4 = [GR_Table_Base], -16 + nop.i 999 ;; +} +{ .mmf +(p10) ldfe FR_C_5 = [GR_Table_Base], -16 +// +// Z = FR_rsq * FR_rsq +// +(p9) ldfe FR_S_3 = [GR_Table_Base], -16 +// +// Compute FR_rsq = r * r +// if (i_1 == 0) load S_4 +// if (i_1 != 0) load C_4 +// +(p0) fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;; +} +// +// if (i_1 == 0) load S_3 +// if (i_1 != 0) load C_3 +// +{ .mmi +(p9) ldfe FR_S_2 = [GR_Table_Base], -16 ;; +// +// if (i_1 == 0) load S_2 +// if (i_1 != 0) load C_2 +// +(p9) ldfe FR_S_1 = [GR_Table_Base], -16 + nop.i 999 +} +{ .mmi +(p10) ldfe FR_C_4 = [GR_Table_Base], -16 ;; +(p10) ldfe FR_C_3 = [GR_Table_Base], -16 + nop.i 999 ;; +} +{ .mmi +(p10) ldfe FR_C_2 = [GR_Table_Base], -16 ;; +(p10) ldfe FR_C_1 = [GR_Table_Base], -16 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// if (i_1 != 0): +// poly_lo = FR_rsq * C_5 + C_4 +// poly_hi = FR_rsq * C_2 + C_1 +// +(p9) fma.s1 FR_Z = FR_Z, FR_r, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1 == 0) load S_1 +// if (i_1 != 0) load C_1 +// +(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// c = -c * r +// dummy fmpy's to flag inexact. 
+// +(p9) fma.s0 FR_S_4 = FR_S_4, FR_S_4, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly_lo = FR_rsq * poly_lo + C_3 +// poly_hi = FR_rsq * poly_hi +// +(p0) fma.s1 FR_Z = FR_Z, FR_rsq, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// if (i_1 == 0): +// poly_lo = FR_rsq * S_5 + S_4 +// poly_hi = FR_rsq * S_2 + S_1 +// +(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1 == 0): +// Z = Z * r for only one of the small r cases - not there +// in original implementation notes. +// +(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fma.s0 FR_C_1 = FR_C_1, FR_C_1, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// poly_lo = FR_rsq * poly_lo + S_3 +// poly_hi = FR_rsq * poly_hi +// +(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1 == 0): dummy fmpy's to flag inexact +// r = 1 +// +(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// poly_hi = r * poly_hi +// +(p0) fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fms.s1 FR_r = f0, f1, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly_hi = Z * poly_lo + c +// if i_0 == 1: r = -r +// +(p0) fma.s1 FR_poly = FR_poly, f1, FR_poly_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fms.s0 FR_Input_X = FR_r, f1, FR_poly + nop.i 999 +} +{ .mfb + nop.m 999 +// +// poly = poly + poly_hi +// +(p11) fma.s0 FR_Input_X = FR_r, f1, FR_poly +// +// if (i_0 == 0) Result = r + poly +// if (i_0 != 0) Result = r - poly +// +(p0) br.ret.sptk b0 ;; +} +L(SINCOSL_NORMAL_R): +{ 
.mii + nop.m 999 +(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; +// +// Set table_ptr1 and table_ptr2 to base address of +// constant table. +(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_rsq = FR_r, FR_r, f0 +(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; +} +{ .mfi + nop.m 999 +(p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r +(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0 +} +;; + +// ****************************************************************** +// ****************************************************************** +// ****************************************************************** +// +// r and c have been computed. +// We known whether this is the sine or cosine routine. +// Make sure ftz mode is set - should be automatic when using wre +// Get [i_0,i_1] - two lsb of N_fix_gr alone. +// + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 GR_Table_Base = [GR_Table_Base] + nop.m 999 + nop.i 999 +} +;; + + +{ .mfi +(p10) add GR_Table_Base = 384, GR_Table_Base +(p12) fms.s1 FR_Input_X = f0, f1, f1 +(p9) add GR_Table_Base = 224, GR_Table_Base ;; +} +{ .mfi +(p10) ldfe FR_QQ_8 = [GR_Table_Base], 16 +// +// if (i_1==0) poly = poly * FR_rsq + PP_1_lo +// else poly = FR_rsq * poly +// +(p11) fma.s1 FR_Input_X = f0, f1, f1 + nop.i 999 ;; +} +{ .mmb +(p10) ldfe FR_QQ_7 = [GR_Table_Base], 16 +// +// Adjust table pointers based on i_0 +// Compute rsq = r * r +// +(p9) ldfe FR_PP_8 = [GR_Table_Base], 16 + nop.b 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 + nop.i 999 ;; +} +{ .mmf +(p9) ldfe FR_PP_7 = [GR_Table_Base], 16 +(p10) ldfe FR_QQ_6 = [GR_Table_Base], 16 +// +// Load PP_8 and QQ_8; PP_7 and QQ_7 +// +(p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;; +} +// +// if (i_1==0) poly = PP_7 + FR_rsq * PP_8. +// else poly = QQ_7 + FR_rsq * QQ_8. 
+// +{ .mmb +(p9) ldfe FR_PP_6 = [GR_Table_Base], 16 +(p10) ldfe FR_QQ_5 = [GR_Table_Base], 16 + nop.b 999 ;; +} +{ .mmb +(p9) ldfe FR_PP_5 = [GR_Table_Base], 16 +(p10) ldfe FR_S_1 = [GR_Table_Base], 16 + nop.b 999 ;; +} +{ .mmb +(p10) ldfe FR_QQ_1 = [GR_Table_Base], 16 +(p9) ldfe FR_C_1 = [GR_Table_Base], 16 + nop.b 999 ;; +} +{ .mmb +(p10) ldfe FR_QQ_4 = [GR_Table_Base], 16 +(p9) ldfe FR_PP_1 = [GR_Table_Base], 16 + nop.b 999 ;; +} +{ .mmb +(p10) ldfe FR_QQ_3 = [GR_Table_Base], 16 +// +// if (i_1=0) corr = corr + c*c +// else corr = corr * c +// +(p9) ldfe FR_PP_4 = [GR_Table_Base], 16 + nop.b 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 + nop.i 999 ;; +} +// +// if (i_1=0) poly = rsq * poly + PP_5 +// else poly = rsq * poly + QQ_5 +// Load PP_4 or QQ_4 +// +{ .mmi +(p9) ldfe FR_PP_3 = [GR_Table_Base], 16 ;; +(p10) ldfe FR_QQ_2 = [GR_Table_Base], 16 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// r_hi = frcpa(frcpa(r)). +// r_cube = r * FR_rsq. +// +(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 + nop.i 999 ;; +} +// +// Do dummy multiplies so inexact is always set. 
+// +{ .mfi +(p9) ldfe FR_PP_2 = [GR_Table_Base], 16 +// +// r_lo = r - r_hi +// +(p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0 + nop.i 999 ;; +} +{ .mbb +(p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16 + nop.b 999 + nop.b 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1=0) U_lo = r_hi * r_hi +// else U_lo = r_hi + r +// +(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1=0) corr = C_1 * rsq +// else corr = S_1 * r_cubed + r +// +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r + nop.i 999 +} +{ .mfi + nop.m 999 +// +// if (i_1=0) U_hi = r_hi + U_hi +// else U_hi = QQ_1 * U_hi + 1 +// +(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// U_hi = r_hi * r_hi +// +(p0) fms.s1 FR_r_lo = FR_r, f1, FR_r_hi + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Load PP_1, PP_6, PP_5, and C_1 +// Load QQ_1, QQ_6, QQ_5, and S_1 +// +(p0) fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5 + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1=0) U_lo = r * r_hi + U_lo +// else U_lo = r_lo * U_lo +// +(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// if (i_1 =0) U_hi = r + U_hi +// if (i_1 =0) U_lo = r_lo * U_lo +// +// +(p9) fma.s0 FR_PP_5 = FR_PP_5, FR_PP_4, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1=0) poly = poly * rsq + PP_6 
+// else poly = poly * rsq + QQ_6 +// +(p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fma.s0 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1!=0) U_hi = PP_1 * U_hi +// if (i_1!=0) U_lo = r * r + U_lo +// Load PP_3 or QQ_3 +// +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Load PP_2, QQ_2 +// +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1==0) poly = FR_rsq * poly + PP_3 +// else poly = FR_rsq * poly + QQ_3 +// Load PP_1_lo +// +(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1 =0) poly = poly * rsq + pp_r4 +// else poly = poly * rsq + qq_r4 +// +(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1==0) U_lo = PP_1_hi * U_lo +// else U_lo = QQ_1 * U_lo +// +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_0==0) Result = 1 +// else Result = -1 +// +(p0) fma.s1 FR_V = FR_U_lo, f1, FR_corr + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1==0) poly = FR_rsq * poly + PP_2 +// else poly = FR_rsq * poly + QQ_2 +// +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) 
fma.s1 FR_poly = FR_rsq, FR_poly, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// V = U_lo + corr +// +(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if (i_1==0) poly = r_cube * poly +// else poly = FR_rsq * poly +// +(p0) fma.s1 FR_V = FR_poly, f1, FR_V + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fms.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V + nop.i 999 +} +{ .mfb + nop.m 999 +// +// V = V + poly +// +(p11) fma.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V +// +// if (i_0==0) Result = Result * U_hi + V +// else Result = Result * U_hi - V +// +(p0) br.ret.sptk b0 +};; + +// +// If cosine, FR_Input_X = 1 +// If sine, FR_Input_X = +/-Zero (Input FR_Input_X) +// Results are exact, no exceptions +// + +L(SINCOSL_ZERO): +{ .mbb +(p0) cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos + nop.b 999 + nop.b 999 ;; +} +{ .mfi + nop.m 999 +(p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X + nop.i 999 +} +{ .mfb + nop.m 999 +(p6) fmerge.s FR_Input_X = f1, f1 +(p0) br.ret.sptk b0 ;; +} +L(SINCOSL_SPECIAL): +{ .mfb + nop.m 999 +// +// Path for Arg = +/- QNaN, SNaN, Inf +// Invalid can be raised. 
SNaNs +// become QNaNs +// +(p0) fmpy.s0 FR_Input_X = FR_Input_X, f0 +(p0) br.ret.sptk b0 ;; +} +.endp cosl# +ASM_SIZE_DIRECTIVE(cosl#) + +// Call int pi_by_2_reduce(double* x, double *y) +// for |arguments| >= 2**63 +// Address to save r and c as double +// +// sp+32 -> f0 +// r45 sp+16 -> f0 +// r44 -> sp -> InputX +// +// NOTE(review): as coded below, r45 = old sp - 32 (= new sp + 32) is the +// r slot, r45 + 16 (= new sp + 48) is the c slot, and r44 = new sp + 16 is +// the x slot; the diagram above looks offset from that - confirm. +// +// Flow: store x, zero r and c, call __libm_pi_by_2_reduce, reload x, r, c, +// set GR_N_Inc = GR_Sin_or_Cos + r8, then branch to SINCOSL_SMALL_R when +// FR_Neg_Two_to_M3 < r < FR_Two_to_M3 and to SINCOSL_NORMAL_R otherwise. +// + +.proc __libm_callout +__libm_callout: +L(SINCOSL_ARG_TOO_LARGE): +.prologue +{ .mfi + add r45=-32,sp // Parameter: r address + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [r45] = f0,16 // Clear Parameter r on stack + add r44 = 16,sp // Parameter x address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [r45] = f0,-16 // Clear Parameter c on stack + nop.i 0 + nop.b 0 +} +{ .mib + stfe [r44] = FR_Input_X // Store Parameter x on stack + nop.i 0 +(p0) br.call.sptk b0=__libm_pi_by_2_reduce# ;; +};; +{ .mii +(p0) ldfe FR_Input_X =[r44],16 +// +// Get X off the stack (above) +// Step the table pointer back 16 bytes so the ldfs pair below +// re-reads FR_Two_to_M3 and FR_Neg_Two_to_M3 +// +(p0) adds GR_Table_Base1 = -16, GR_Table_Base1 +// +// N_Inc = Sin_or_Cos + r8 (r8 is presumably the int returned by +// the reduction call - confirm) +// +(p0) add GR_N_Inc = GR_Sin_or_Cos,r8 ;; +} +{ .mmb +(p0) ldfe FR_r =[r45],16 +// +// Get r off the stack (above); c follows in the next bundle +// Load 2**(-3) through the readjusted table pointer +// +(p0) ldfs FR_Two_to_M3 = [GR_Table_Base1],4 + nop.b 999 ;; +} +{ .mmb +(p0) ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0 +(p0) ldfe FR_c =[r45] + nop.b 999 ;; +} +{ .mfi +.restore sp + add sp = 64,sp // Restore stack pointer +(p0) fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3 + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + nop.b 0 +};; +{ .mfi + nop.m 999 +(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3 + nop.i 999 ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(SINCOSL_SMALL_R) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p0) br.cond.sptk L(SINCOSL_NORMAL_R) ;; +} +.endp __libm_callout
+ASM_SIZE_DIRECTIVE(__libm_callout) +.type __libm_pi_by_2_reduce#,@function +.global __libm_pi_by_2_reduce# diff --git a/sysdeps/ia64/fpu/s_expm1.S b/sysdeps/ia64/fpu/s_expm1.S new file mode 100644 index 0000000000..840b1c0b6e --- /dev/null +++ b/sysdeps/ia64/fpu/s_expm1.S @@ -0,0 +1,1755 @@ +.file "exp_m1.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// HISTORY +// 2/02/00 Initial Version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT.
+// +// ********************************************************************* +// +// Function: Combined exp(x) and expm1(x), where +// x +// exp(x) = e , for double precision x values +// x +// expm1(x) = e - 1 for double precision x values +// +// ********************************************************************* +// +// Accuracy: Within .7 ulps for 80-bit floating point values +// Very accurate for double precision values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9,f32-f61, f99-f102 +// +// General Purpose Registers: +// r32-r61 +// r62-r65 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions raised when appropriate for exp and expm1 +// Underflow exceptions raised when appropriate for exp and expm1 +// (Error Handling Routine called for overflow and Underflow) +// Inexact raised when appropriate by algorithm +// +// exp(inf) = inf +// exp(-inf) = +0 +// exp(SNaN) = QNaN +// exp(QNaN) = QNaN +// exp(0) = 1 +// exp(EM_special Values) = QNaN +// expm1(inf) = inf +// expm1(-inf) = -1 +// expm1(SNaN) = QNaN +// expm1(QNaN) = QNaN +// expm1(0) = 0 +// expm1(EM_special Values) = QNaN +// +// ********************************************************************* +// +// Implementation and Algorithm Notes: +// +// ker_exp_64( in_FR : X, +// in_GR : Flag, +// in_GR : Expo_Range, +// out_FR : Y_hi, +// out_FR : Y_lo, +// out_FR : scale, +// out_PR : Safe ) +// +// On input, X is in register format and +// Flag = 0 for exp, +// Flag = 1 for expm1, +// +// On output, provided X and X_cor are real numbers, then +// +// scale*(Y_hi + Y_lo) approximates exp(X) if Flag is 0 +// scale*(Y_hi + Y_lo) approximates exp(X)-1 if Flag is 1 +// +//
The accuracy is sufficient for a highly accurate 64 sig. +// bit implementation. Safe is set if there is no danger of +// overflow/underflow when the result is composed from scale, +// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set. +// Otherwise, one must prepare to handle the possible exception +// appropriately. Note that SAFE not set (false) does not mean +// that overflow/underflow will occur; only the setting of SAFE +// guarantees the opposite. +// +// **** High Level Overview **** +// +// The method consists of three cases. +// +// If |X| < Tiny use case exp_tiny; +// else if |X| < 2^(-6) use case exp_small; +// else use case exp_regular; +// +// Case exp_tiny: +// +// 1 + X can be used to approximate exp(X) or exp(X+X_cor); +// X + X^2/2 can be used to approximate exp(X) - 1 +// +// Case exp_small: +// +// Here, exp(X), exp(X+X_cor), and exp(X) - 1 can all be +// approximated by a relatively simple polynomial. +// +// This polynomial resembles the truncated Taylor series +// +// exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! +// +// Case exp_regular: +// +// Here we use a table lookup method. The basic idea is that in +// order to compute exp(X), we accurately decompose X into +// +// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13. +// +// Hence +// +// exp(X) = 2^( N / 2^12 ) * exp(r). +// +// The value 2^( N / 2^12 ) is obtained by simple combinations +// of values calculated beforehand and stored in table; exp(r) +// is approximated by a short polynomial because |r| is small. +// +// We elaborate this method in 4 steps. +// +// Step 1: Reduction +// +// The value 2^12/log(2) is stored as a double-extended number +// L_Inv. +// +// N := round_to_nearest_integer( X * L_Inv ) +// +// The value log(2)/2^12 is stored as two numbers L_hi and L_lo so +// that r can be computed accurately via +// +// r := (X - N*L_hi) - N*L_lo +// +// We pick L_hi such that N*L_hi is representable in 64 sig. bits +// and thus the FMA X - N*L_hi is error free.
So r is within +// 1 rounding error of an exact reduction with respect to +// +// L_hi + L_lo. +// +// In particular, L_hi has 30 significant bits and can be stored +// as a double-precision number; L_lo has 64 significant bits and +// is stored as a double-extended number. +// +// In the case Flag = 2, we further modify r by +// +// r := r + X_cor. +// +// Step 2: Approximation +// +// exp(r) - 1 is approximated by a short polynomial of the form +// +// r + A_1 r^2 + A_2 r^3 + A_3 r^4 . +// +// Step 3: Composition from Table Values +// +// The value 2^( N / 2^12 ) can be composed from a couple of tables +// of precalculated values. First, express N as three integers +// K, M_1, and M_2 as +// +// N = K * 2^12 + M_1 * 2^6 + M_2 +// +// Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative. +// When N is represented in 2's complement, M_2 is simply the 6 +// lsb's, M_1 is the next 6, and K is simply N shifted right +// arithmetically (sign extended) by 12 bits. +// +// Now, 2^( N / 2^12 ) is simply +// +// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 ) +// +// Clearly, 2^K needs no tabulation. The other two values are less +// trivial because if we store each accurately to more than working +// precision, then their product is too expensive to calculate. We +// use the following method. +// +// Define two mathematical values, delta_1 and delta_2, implicitly +// such that +// +// T_1 = exp( [M_1 log(2)/2^6] - delta_1 ) +// T_2 = exp( [M_2 log(2)/2^12] - delta_2 ) +// +// are representable as 24 significant bits. To illustrate the idea, +// we show how we define delta_1: +// +// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) ) +// delta_1 = (M_1 log(2)/2^6) - log( T_1 ) +// +// The last equality means mathematical equality. We then tabulate +// +// W_1 := exp(delta_1) - 1 +// W_2 := exp(delta_2) - 1 +// +// Both in double precision. 
+// +// From the tabulated values T_1, T_2, W_1, W_2, we compose the values +// T and W via +// +// T := T_1 * T_2 ...exactly +// W := W_1 + (1 + W_1)*W_2 +// +// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2. +// The mathematical product of T and (W+1) is an accurate representation +// of 2^(M_1/2^6) * 2^(M_2/2^12). +// +// Step 4: Reconstruction +// +// Finally, we can reconstruct exp(X) and exp(X) - 1. +// Because +// +// X = K * log(2) + (M_1*log(2)/2^6 - delta_1) +// + (M_2*log(2)/2^12 - delta_2) +// + delta_1 + delta_2 + r ...accurately +// We have +// +// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] ) +// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] ) +// ~=~ 2^K * ( T + T*[(exp(delta)-1) +// + exp(delta)*(exp(r)-1)] ) +// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) ) +// ~=~ 2^K * ( Y_hi + Y_lo ) +// +// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r)) +// +// For exp(X)-1, we have +// +// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 +// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) ) +// +// and we combine Y_hi + Y_lo - 2^(-K) into the form of two +// numbers Y_hi + Y_lo carefully. +// +// **** Algorithm Details **** +// +// A careful algorithm must be used to realize the mathematical ideas +// accurately. We describe each of the three cases. We assume SAFE +// is preset to be TRUE. +// +// Case exp_tiny: +// +// The important points are to ensure an accurate result under +// different rounding directions and a correct setting of the SAFE +// flag. +// +// If Flag is 1, then +// SAFE := False ...possibility of underflow +// Scale := 1.0 +// Y_hi := X +// Y_lo := 2^(-17000) +// Else +// Scale := 1.0 +// Y_hi := 1.0 +// Y_lo := X ...for different rounding modes +// Endif +// +// Case exp_small: +// +// Here we compute a simple polynomial. To exploit parallelism, we split +// the polynomial into several portions. +// +// Let r = X +// +// If Flag is not 1 ...i.e. 
exp( argument ) +// +// rsq := r * r; +// r4 := rsq*rsq +// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6)) +// poly_hi := r + rsq*(P_1 + r*P_2) +// Y_lo := poly_hi + r4 * poly_lo +// set lsb(Y_lo) to 1 +// Y_hi := 1.0 +// Scale := 1.0 +// +// Else ...i.e. exp( argument ) - 1 +// +// rsq := r * r +// r4 := rsq * rsq +// r6 := rsq * r4 +// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7)) +// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4)) +// Y_lo := rsq*poly_hi + poly_lo +// set lsb(Y_lo) to 1 +// Y_hi := X +// Scale := 1.0 +// +// Endif +// +// Case exp_regular: +// +// The previous description contain enough information except the +// computation of poly and the final Y_hi and Y_lo in the case for +// exp(X)-1. +// +// The computation of poly for Step 2: +// +// rsq := r*r +// poly := r + rsq*(A_1 + r*(A_2 + r*A_3)) +// +// For the case exp(X) - 1, we need to incorporate 2^(-K) into +// Y_hi and Y_lo at the end of Step 4. +// +// If K > 10 then +// Y_lo := Y_lo - 2^(-K) +// Else +// If K < -10 then +// Y_lo := Y_hi + Y_lo +// Y_hi := -2^(-K) +// Else +// Y_hi := Y_hi - 2^(-K) +// End If +// End If +// + +#include "libm_support.h" + +GR_SAVE_PFS = r59 +GR_SAVE_B0 = r60 +GR_SAVE_GP = r61 + +GR_Parameter_X = r62 +GR_Parameter_Y = r63 +GR_Parameter_RESULT = r64 + +FR_X = f9 +FR_Y = f1 +FR_RESULT = f99 + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 64 +Constants_exp_64_Arg: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) +data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 +data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 +data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 +// /* Inv_L, L_hi, L_lo */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) + +.align 64 +Constants_exp_64_Exponents: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) +data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF +data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 
0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF +data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF +ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) + +.align 64 +Constants_exp_64_A: +ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) +data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 +data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 +data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_A) + +.align 64 +Constants_exp_64_P: +ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) +data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 +data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 +data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 +data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_P) + +.align 64 +Constants_exp_64_Q: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object) +data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000 +data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000 +data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000 +data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000 +data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x00000000,0x80000000,0x00003FFE,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_Q) + +.align 64 +Constants_exp_64_T1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) +data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 +data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 +data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC +data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D +data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA +data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516 +data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A +data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4 +data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B +data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD +data4 
0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15 +data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B +data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 +data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A +data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 +data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C +ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) + +.align 64 +Constants_exp_64_T2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) +data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 +data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 +data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E +data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 +data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 +data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA +data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 +data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A +data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 +data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA +data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 +data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA +data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 +data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 +data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE +data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 +ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) + +.align 64 +Constants_exp_64_W1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) +data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454 +data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 +data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA +data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 +data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 +data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE +data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B +data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 +data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 +data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 +data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A +data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB +data4 
0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E +data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA +data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 +data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B +data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 +data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 +data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 +data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 +data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB +data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 +data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C +data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D +data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 +data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F +data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 +data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 +data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC +data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB +data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB +data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) + +.align 64 +Constants_exp_64_W2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) +data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 +data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 +data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A +data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E +data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 +data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 +data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 +data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 +data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 +data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D +data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 +data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 +data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 +data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F +data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 +data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 +data4 
0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D +data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 +data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 +data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED +data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B +data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 +data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 +data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C +data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 +data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE +data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 +data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 +data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 +data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 +data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE +data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) + +.section .text +.proc expm1# +.global expm1# +.align 64 + +expm1: +#ifdef _LIBC +.global __expm1# +__expm1: +#endif + + +{ .mii + alloc r32 = ar.pfs,0,30,4,0 +(p0) add r33 = 1, r0 +(p0) cmp.eq.unc p7, p0 = r0, r0 +} +;; + + +// expm1 entry: f8 holds the argument on entry and the result on return. +// Set p7 true for expm1 +// Set Flag = r33 = 1 for expm1 +// These are really no longer necessary, but are a remnant of +// when this file had multiple entry points. +// They should be carefully removed + + + +{ .mfi +(p0) add r32 = 1,r0 +(p0) fnorm.s1 f9 = f8 + nop.i 999 +} + + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6, p8 = f8, 0x1E7 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fclass.nm.unc p9, p0 = f8, 0x1FF + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) mov f36 = f1 + nop.i 999 ;; +} + +// +// Identify NatVals, NaNs, Infs, and Zeros. +// Identify EM unsupporteds. 
+// Save special input registers +// +// Create FR_X_cor = 0.0 +// GR_Flag = 1 (expm1) +// GR_Expo_Range = 1 +// FR_Scale = 1.0 +// + +{ .mfb + nop.m 999 +(p0) mov f32 = f0 +(p6) br.cond.spnt EXP_64_SPECIAL ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt EXP_64_UNSUPPORTED ;; +} + +// +// Branch out for special input values +// + +{ .mfi +(p0) cmp.ne.unc p12, p13 = 0x01, r33 +(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0 +(p0) cmp.eq.unc p15, p0 = r0, r0 +} + +// +// Raise possible denormal operand exception +// Normalize x +// +// This function computes exp( x + x_cor) +// Input FR 1: FR_X +// Input FR 2: FR_X_cor +// Input GR 1: GR_Flag +// Input GR 2: GR_Expo_Range +// Output FR 3: FR_Y_hi +// Output FR 4: FR_Y_lo +// Output FR 5: FR_Scale +// Output PR 1: PR_Safe + +// +// Prepare to load constants +// Set Safe = True +// + +{ .mmi +(p0) addl r34 = @ltoff(Constants_exp_64_Arg#), gp +(p0) addl r40 = @ltoff(Constants_exp_64_W1#), gp +(p0) addl r41 = @ltoff(Constants_exp_64_W2#), gp +} +;; + +{ .mmi + ld8 r34 = [r34] + ld8 r40 = [r40] +(p0) addl r50 = @ltoff(Constants_exp_64_T1#), gp +} +;; + + +{ .mmi + ld8 r41 = [r41] +(p0) ldfe f37 = [r34],16 +(p0) addl r51 = @ltoff(Constants_exp_64_T2#), gp +} +;; + +// +// N = fcvt.fx(float_N) +// Set p14 if -6 > expo_X +// + + +// +// Bias = 0x0FFFF +// expo_X = expo_X and Mask +// + +// +// Load L_lo +// Set p10 if 14 < expo_X +// + +{ .mmi + ld8 r50 = [r50] +(p0) ldfe f40 = [r34],16 + nop.i 999 +} +;; + +{ .mlx + nop.m 999 +(p0) movl r58 = 0x0FFFF +} +;; + +// +// Load W2_ptr +// Branch to SMALL if expo_X < -6 +// + +// +// float_N = X * L_Inv +// expo_X = exponent of X +// Mask = 0x1FFFF +// + +{ .mmi + ld8 r51 = [r51] +(p0) ldfe f41 = [r34],16 +} +;; + +{ .mlx +(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp +(p0) movl r39 = 0x1FFFF +} +;; + +{ .mmi + ld8 r34 = [r34] +(p0) getf.exp r37 = f9 + nop.i 999 +} +;; + +{ .mii + nop.m 999 + nop.i 999 +(p0) and r37 = r37, r39 ;; +} + +{ .mmi +(p0) sub r37 = r37, r58 ;; +(p0) 
cmp.gt.unc p14, p0 = -6, r37 +(p0) cmp.lt.unc p10, p0 = 14, r37 ;; +} + +{ .mfi + nop.m 999 +// +// Load L_inv +// Set p12 true for Flag = 0 (exp) +// Set p13 true for Flag = 1 (expm1) +// +(p0) fmpy.s1 f38 = f9, f37 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// Load L_hi +// expo_X = expo_X - Bias +// get W1_ptr +// +(p0) fcvt.fx.s1 f39 = f38 +(p14) br.cond.spnt EXP_SMALL ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.spnt EXP_HUGE ;; +} + +{ .mmi +(p0) shladd r34 = r32,4,r34 +(p0) addl r35 = @ltoff(Constants_exp_64_A#), gp + nop.i 999 +} +;; + +{ .mmi + ld8 r35 = [r35] + nop.m 999 + nop.i 999 +} +;; + +// +// Load T_1,T_2 +// + +{ .mmb +(p0) ldfe f51 = [r35],16 +(p0) ld8 r45 = [r34],8 + nop.b 999 ;; +} +// +// Set Safe = True if k >= big_expo_neg +// Set Safe = False if k < big_expo_neg +// + +{ .mmb +(p0) ldfe f49 = [r35],16 +(p0) ld8 r48 = [r34],0 + nop.b 999 ;; +} + +{ .mfi + nop.m 999 +// +// Branch to HUGE if expo_X > 14 +// +(p0) fcvt.xf f38 = f39 + nop.i 999 ;; +} + +{ .mfi +(p0) getf.sig r52 = f39 + nop.f 999 + nop.i 999 ;; +} + +{ .mii + nop.m 999 +(p0) extr.u r43 = r52, 6, 6 ;; +// +// r = r - float_N * L_lo +// K = extr(N_fix,12,52) +// +(p0) shladd r40 = r43,3,r40 ;; +} + +{ .mfi +(p0) shladd r50 = r43,2,r50 +(p0) fnma.s1 f42 = f40, f38, f9 +// +// float_N = float(N) +// N_fix = significand of N +// +(p0) extr.u r42 = r52, 0, 6 +} + +{ .mmi +(p0) ldfd f43 = [r40],0 ;; +(p0) shladd r41 = r42,3,r41 +(p0) shladd r51 = r42,2,r51 +} +// +// W_1_p1 = 1 + W_1 +// + +{ .mmi +(p0) ldfs f44 = [r50],0 ;; +(p0) ldfd f45 = [r41],0 +// +// M_2 = extr(N_fix,0,6) +// M_1 = extr(N_fix,6,6) +// r = X - float_N * L_hi +// +(p0) extr r44 = r52, 12, 52 +} + +{ .mmi +(p0) ldfs f46 = [r51],0 ;; +(p0) sub r46 = r58, r44 +(p0) cmp.gt.unc p8, p15 = r44, r45 +} +// +// W = W_1 + W_1_p1*W_2 +// Load A_2 +// Bias_m_K = Bias - K +// + +{ .mii +(p0) ldfe f40 = [r35],16 +// +// load A_1 +// poly = A_2 + r*A_3 +// rsq = r * r +// neg_2_mK = exponent of Bias_m_k +// +(p0) add 
r47 = r58, r44 ;; +// +// Set Safe = True if k <= big_expo_pos +// Set Safe = False if k > big_expo_pos +// Load A_3 +// +(p15) cmp.lt p8,p15 = r44,r48 ;; +} + +{ .mmf +(p0) setf.exp f61 = r46 +// +// Bias_p_K = Bias + K +// T = T_1 * T_2 +// +(p0) setf.exp f36 = r47 +(p0) fnma.s1 f42 = f41, f38, f42 ;; +} + +{ .mfi + nop.m 999 +// +// Load W_1,W_2 +// Load big_exp_pos, load big_exp_neg +// +(p0) fadd.s1 f47 = f43, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 f52 = f42, f51, f49 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 f48 = f42, f42 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 f53 = f44, f46 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 f54 = f45, f47, f43 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fneg f61 = f61 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 f52 = f42, f52, f40 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fadd.s1 f55 = f54, f1 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// W + Wp1 * poly +// +(p0) mov f34 = f53 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// A_1 + r * poly +// Scale = setf_exp(Bias_p_k) +// +(p0) fma.s1 f52 = f48, f52, f42 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// poly = r + rsq(A_1 + r*poly) +// Wp1 = 1 + W +// neg_2_mK = -neg_2_mK +// +(p0) fma.s1 f35 = f55, f52, f54 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmpy.s1 f35 = f35, f53 +// +// Y_hi = T +// Y_lo = T * (W + Wp1*poly) +// +(p12) br.cond.sptk EXP_MAIN ;; +} +// +// Branch if exp(x) +// Continue for exp(x)-1 +// + +{ .mii +(p0) cmp.lt.unc p12, p13 = 10, r44 + nop.i 999 ;; +// +// Set p12 if 10 < K, Else p13 +// +(p13) cmp.gt.unc p13, p14 = -10, r44 ;; +} +// +// K > 10: Y_lo = Y_lo + neg_2_mK +// K <=10: Set p13 if -10 > K, Else set p14 +// + +{ .mfi +(p13) cmp.eq p15, p0 = r0, r0 +(p14) fadd.s1 f34 = f61, f34 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fadd.s1 f35 = f35, f61 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p13) fadd.s1 f35 = f35, f34 + nop.i 999 +} + +{ .mfb + nop.m 999 +// +// K <= 10 and K < -10, Set 
Safe = True +// K <= 10 and K < -10, Y_lo = Y_hi + Y_lo +// K <= 10 and K >= -10, Y_hi = Y_hi + neg_2_mk +// +(p13) mov f34 = f61 +(p0) br.cond.sptk EXP_MAIN ;; +} +EXP_SMALL: + +{ .mmi +(p12) addl r35 = @ltoff(Constants_exp_64_P#), gp +(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp + nop.i 999 +} +;; + +{ .mmi +(p12) ld8 r35 = [r35] + ld8 r34 = [r34] + nop.i 999 +} +;; + + +{ .mmi +(p13) addl r35 = @ltoff(Constants_exp_64_Q#), gp + nop.m 999 + nop.i 999 +} +;; + + +// +// Return +// K <= 10 and K < -10, Y_hi = neg_2_mk +// +// /*******************************************************/ +// /*********** Branch EXP_SMALL *************************/ +// /*******************************************************/ + +{ .mfi +(p13) ld8 r35 = [r35] +(p0) mov f42 = f9 +(p0) add r34 = 0x48,r34 +} +;; + +// +// Flag = 0 +// r4 = rsq * rsq +// + +{ .mfi +(p0) ld8 r49 =[r34],0 + nop.f 999 + nop.i 999 ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +// +// Flag = 1 +// +(p0) cmp.lt.unc p14, p0 = r37, r49 ;; +} + +{ .mfi + nop.m 999 +// +// r = X +// +(p0) fmpy.s1 f48 = f42, f42 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// rsq = r * r +// +(p0) fmpy.s1 f50 = f48, f48 +// +// Is input very small? 
+// +(p14) br.cond.spnt EXP_VERY_SMALL ;; +} +// +// Flag_not1: Y_hi = 1.0 +// Flag is 1: r6 = rsq * r4 +// + +{ .mfi +(p12) ldfe f52 = [r35],16 +(p12) mov f34 = f1 +(p0) add r53 = 0x1,r0 ;; +} + +{ .mfi +(p13) ldfe f51 = [r35],16 +// +// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo +// +(p13) mov f34 = f9 + nop.i 999 ;; +} + +{ .mmf +(p12) ldfe f53 = [r35],16 +// +// For Flag_1, Y_hi = X +// Scale = 1 +// Create 0x000...01 +// +(p0) setf.sig f37 = r53 +(p0) mov f36 = f1 ;; +} + +{ .mmi +(p13) ldfe f52 = [r35],16 ;; +(p12) ldfe f54 = [r35],16 + nop.i 999 ;; +} + +{ .mfi +(p13) ldfe f53 = [r35],16 +(p13) fmpy.s1 f58 = f48, f50 + nop.i 999 ;; +} +// +// Flag_not1: poly_lo = P_5 + r*P_6 +// Flag_1: poly_lo = Q_6 + r*Q_7 +// + +{ .mmi +(p13) ldfe f54 = [r35],16 ;; +(p12) ldfe f55 = [r35],16 + nop.i 999 ;; +} + +{ .mmi +(p12) ldfe f56 = [r35],16 ;; +(p13) ldfe f55 = [r35],16 + nop.i 999 ;; +} + +{ .mmi +(p12) ldfe f57 = [r35],0 ;; +(p13) ldfe f56 = [r35],16 + nop.i 999 ;; +} + +{ .mfi +(p13) ldfe f57 = [r35],0 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// For Flag_not_1, load p5,p6,p1,p2 +// Else load from the Q table +// +(p12) fma.s1 f60 = f52, f42, f53 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p13) fma.s1 f60 = f51, f42, f52 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 f60 = f60, f42, f54 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 f59 = f56, f42, f57 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p13) fma.s1 f60 = f42, f60, f53 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 f59 = f59, f48, f42 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7) +// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6) +// Flag_not1: poly_hi = (P_1 + r*P_2) +// +(p13) fmpy.s1 f60 = f60, f58 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 f60 = f60, f42, f55 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_1: poly_lo = r6 *(Q_5 + ....) 
+// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2) +// +(p12) fma.s1 f35 = f60, f50, f59 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p13) fma.s1 f59 = f54, f42, f55 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_not1: Y_lo = rsq* poly_hi + poly_lo +// Flag_1: poly_lo = rsq* poly_hi + poly_lo +// +(p13) fma.s1 f59 = f59, f42, f56 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_not_1: (P_1 + r*P_2) +// +(p13) fma.s1 f59 = f59, f42, f57 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2) +// +(p13) fma.s1 f35 = f59, f48, f60 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Create 0.000...01 +// +(p0) for f37 = f35, f37 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// Set lsb of Y_lo to 1 +// +(p0) fmerge.se f35 = f35,f37 +(p0) br.cond.sptk EXP_MAIN ;; +} +EXP_VERY_SMALL: + +{ .mmi + nop.m 999 +(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp + nop.i 999;; +} + +{ .mfi +(p13) ld8 r34 = [r34]; +(p12) mov f35 = f9 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p12) mov f34 = f1 +(p12) br.cond.sptk EXP_MAIN ;; +} + +{ .mlx +(p13) add r34 = 8,r34 +(p13) movl r39 = 0x0FFFE ;; +} +// +// Load big_exp_neg +// Create 1/2's exponent +// + +{ .mii +(p13) setf.exp f56 = r39 +(p13) shladd r34 = r32,4,r34 ;; + nop.i 999 +} +// +// Negative exponents are stored after positive +// + +{ .mfi +(p13) ld8 r45 = [r34],0 +// +// Y_hi = x +// Scale = 1 +// +(p13) fmpy.s1 f35 = f9, f9 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Reset Safe if necessary +// Create 1/2 +// +(p13) mov f34 = f9 + nop.i 999 ;; +} + +{ .mfi +(p13) cmp.lt.unc p0, p15 = r37, r45 +(p13) mov f36 = f1 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// Y_lo = x * x +// +(p13) fmpy.s1 f35 = f35, f56 +// +// Y_lo = x*x/2 +// +(p13) br.cond.sptk EXP_MAIN ;; +} +EXP_HUGE: + +{ .mfi + nop.m 999 +(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0 + nop.i 999 +} + +{ .mlx + nop.m 999 +(p0) movl r39 = 0x15DC0 ;; +} + +{ .mfi +(p14) setf.exp f34 = r39 +(p14) mov f35 = f1 +(p14) cmp.eq p0, p15 = r0, 
r0 ;; +} + +{ .mfb + nop.m 999 +(p14) mov f36 = f34 +// +// If x > 0, Set Safe = False +// If x > 0, Y_hi = 2**(24,000) +// If x > 0, Y_lo = 1.0 +// If x > 0, Scale = 2**(24,000) +// +(p14) br.cond.sptk EXP_MAIN ;; +} + +{ .mlx + nop.m 999 +(p12) movl r39 = 0xA240 +} + +{ .mlx + nop.m 999 +(p12) movl r38 = 0xA1DC ;; +} + +{ .mmb +(p13) cmp.eq p15, p14 = r0, r0 +(p12) setf.exp f34 = r39 + nop.b 999 ;; +} + +{ .mlx +(p12) setf.exp f35 = r38 +(p13) movl r39 = 0xFF9C +} + +{ .mfi + nop.m 999 +(p13) fsub.s1 f34 = f0, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) mov f36 = f34 +(p12) cmp.eq p0, p15 = r0, r0 ;; +} + +{ .mfi +(p13) setf.exp f35 = r39 +(p13) mov f36 = f1 + nop.i 999 ;; +} +EXP_MAIN: + +{ .mfi +(p0) cmp.ne.unc p12, p0 = 0x01, r33 +(p0) fmpy.s1 f101 = f36, f35 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.d.s0 f99 = f34, f36, f101 +(p15) br.cond.sptk EXP_64_RETURN;; +} + +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x01 + nop.i 999 +} + +{ .mlx + nop.m 999 +(p0) movl r50 = 0x000000000103FF ;; +} +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + RZ + TD (Underflows) +// +// +// If (Safe) is true, then +// Compute result using user supplied status field. +// No overflow or underflow here, but perhaps inexact. +// Return +// Else +// Determine if overflow or underflow was raised. +// Fetch +/- overflow threshold for IEEE single, double, +// double extended +// + +{ .mfi +(p0) setf.exp f60 = r50 +(p0) fma.d.s3 f102 = f34, f36, f101 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x40 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// For Safe, no need to check for over/under. +// For expm1, handle errors like exp. 
+// +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fma.d.s2 f100 = f34, f36, f101 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x40 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fclass.m.unc p12, p0 = f102, 0x00F + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = f102, 0x00F + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Create largest double exponent + 1. +// Create smallest double exponent - 1. +// +(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60 + nop.i 999 ;; +} +// +// fcmp: resultS2 >= + overflow threshold -> set (a) if true +// fcmp: resultS2 <= - overflow threshold -> set (b) if true +// fclass: resultS3 is denorm/unorm/0 -> set (d) if true +// + +{ .mib +(p10) mov r65 = 41 + nop.i 999 +(p10) br.cond.sptk __libm_error_region ;; +} + +{ .mib +(p8) mov r65 = 14 + nop.i 999 +(p8) br.cond.sptk __libm_error_region ;; +} +// +// Report that exp overflowed +// + +{ .mib +(p12) mov r65 = 42 + nop.i 999 +(p12) br.cond.sptk __libm_error_region ;; +} + +{ .mib +(p11) mov r65 = 15 + nop.i 999 +(p11) br.cond.sptk __libm_error_region ;; +} + +{ .mib + nop.m 999 + nop.i 999 +// +// Report that exp underflowed +// +(p0) br.cond.sptk EXP_64_RETURN;; +} +EXP_64_SPECIAL: + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6, p0 = f8, 0x0c3 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p13, p8 = f8, 0x007 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fclass.m.unc p14, p0 = f8, 0x007 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p12, p9 = f8, 0x021 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = f8, 0x022 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fclass.m.unc p10, p0 = f8, 0x022 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Identify +/- 0, Inf, or -Inf +// Generate the right kind of NaN. 
+// +(p13) fadd.d.s0 f99 = f0, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p14) mov f99 = f8 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p6) fadd.d.s0 f99 = f8, f1 +// +// exp(+/-0) = 1 +// expm1(+/-0) = +/-0 +// No exceptions raised +// +(p6) br.cond.sptk EXP_64_RETURN;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p14) br.cond.sptk EXP_64_RETURN;; +} + +{ .mfi + nop.m 999 +(p11) mov f99 = f0 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p10) fsub.d.s1 f99 = f0, f1 +// +// exp(-Inf) = 0 +// expm1(-Inf) = -1 +// No exceptions raised. +// +(p10) br.cond.sptk EXP_64_RETURN;; +} + +{ .mfb + nop.m 999 +(p12) fmpy.d.s1 f99 = f8, f1 +// +// exp(+Inf) = Inf +// No exceptions raised. +// +(p0) br.cond.sptk EXP_64_RETURN;; +} + + +EXP_64_UNSUPPORTED: + +{ .mfb + nop.m 999 +(p0) fmpy.d.s0 f99 = f8, f0 + nop.b 0;; +} + +EXP_64_RETURN: +{ .mfb + nop.m 999 +(p0) mov f8 = f99 +(p0) br.ret.sptk b0 +} +.endp expm1 +ASM_SIZE_DIRECTIVE(expm1) + +.proc __libm_error_region +__libm_error_region: +.prologue +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new 64-byte stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + +// (2) +{ .mmi + stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov 
b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_expm1f.S b/sysdeps/ia64/fpu/s_expm1f.S new file mode 100644 index 0000000000..b317baea13 --- /dev/null +++ b/sysdeps/ia64/fpu/s_expm1f.S @@ -0,0 +1,1742 @@ +.file "exp_m1f.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// HISTORY +// 2/02/00 Initial Version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. 
+// +// ********************************************************************* +// +// Function: Combined expf(x) and expm1f(x), where +// x +// expf(x) = e , for single precision x values +// x +// expm1f(x) = e - 1 for single precision x values +// +// ********************************************************************* +// +// Accuracy: Within .7 ulps for 80-bit floating point values +// Very accurate for single precision values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9,f32-f61, f99-f102 +// +// General Purpose Registers: +// r32-r61 +// r62-r65 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions raised when appropriate for exp and expm1 +// Underflow exceptions raised when appropriate for exp and expm1 +// (Error Handling Routine called for overflow and Underflow) +// Inexact raised when appropriate by algorithm +// +// expf(inf) = inf +// expf(-inf) = +0 +// expf(SNaN) = QNaN +// expf(QNaN) = QNaN +// expf(0) = 1 +// expf(EM_special Values) = QNaN +// expm1f(inf) = inf +// expm1f(-inf) = -1 +// expm1f(SNaN) = QNaN +// expm1f(QNaN) = QNaN +// expm1f(0) = 0 +// expm1f(EM_special Values) = QNaN +// +// ********************************************************************* +// +// Implementation and Algorithm Notes: +// +// ker_exp_64( in_FR : X, +// in_GR : Flag, +// in_GR : Expo_Range +// out_FR : Y_hi, +// out_FR : Y_lo, +// out_FR : scale, +// out_PR : Safe ) +// +// On input, X is in register format and +// Flag = 0 for exp, +// Flag = 1 for expm1, +// +// On output, provided X and X_cor are real numbers, then +// +// scale*(Y_hi + Y_lo) approximates expf(X) if Flag is 0 +// scale*(Y_hi + Y_lo) approximates expf(X)-1 if 
Flag is 1 +// +// The accuracy is sufficient for a highly accurate 64 sig. +// bit implementation. Safe is set if there is no danger of +// overflow/underflow when the result is composed from scale, +// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set. +// Otherwise, one must prepare to handle the possible exception +// appropriately. Note that SAFE not set (false) does not mean +// that overflow/underflow will occur; only the setting of SAFE +// guarantees the opposite. +// +// **** High Level Overview **** +// +// The method consists of three cases. +// +// If |X| < Tiny use case exp_tiny; +// else if |X| < 2^(-6) use case exp_small; +// else use case exp_regular; +// +// Case exp_tiny: +// +// 1 + X can be used to approximate expf(X) or expf(X+X_cor); +// X + X^2/2 can be used to approximate expf(X) - 1 +// +// Case exp_small: +// +// Here, expf(X), expf(X+X_cor), and expf(X) - 1 can all be +// approximated by a relatively simple polynomial. +// +// This polynomial resembles the truncated Taylor series +// +// expf(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! +// +// Case exp_regular: +// +// Here we use a table lookup method. The basic idea is that in +// order to compute expf(X), we accurately decompose X into +// +// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13. +// +// Hence +// +// expf(X) = 2^( N / 2^12 ) * expf(r). +// +// The value 2^( N / 2^12 ) is obtained by simple combinations +// of values calculated beforehand and stored in table; expf(r) +// is approximated by a short polynomial because |r| is small. +// +// We elaborate this method in 4 steps. +// +// Step 1: Reduction +// +// The value 2^12/log(2) is stored as a double-extended number +// L_Inv. +// +// N := round_to_nearest_integer( X * L_Inv ) +// +// The value log(2)/2^12 is stored as two numbers L_hi and L_lo so +// that r can be computed accurately via +// +// r := (X - N*L_hi) - N*L_lo +// +// We pick L_hi such that N*L_hi is representable in 64 sig.
bits +// and thus the FMA X - N*L_hi is error free. So r is only +// 1 rounding error away from an exact reduction with respect to +// +// L_hi + L_lo. +// +// In particular, L_hi has 30 significant bits and can be stored +// as a double-precision number; L_lo has 64 significant bits and +// is stored as a double-extended number. +// +// In the case Flag = 2, we further modify r by +// +// r := r + X_cor. +// +// Step 2: Approximation +// +// expf(r) - 1 is approximated by a short polynomial of the form +// +// r + A_1 r^2 + A_2 r^3 + A_3 r^4 . +// +// Step 3: Composition from Table Values +// +// The value 2^( N / 2^12 ) can be composed from a couple of tables +// of precalculated values. First, express N as three integers +// K, M_1, and M_2 as +// +// N = K * 2^12 + M_1 * 2^6 + M_2 +// +// Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative. +// When N is represented in 2's complement, M_2 is simply the 6 +// lsb's, M_1 is the next 6, and K is simply N shifted right +// arithmetically (sign extended) by 12 bits. +// +// Now, 2^( N / 2^12 ) is simply +// +// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 ) +// +// Clearly, 2^K needs no tabulation. The other two values are less +// trivial because if we store each accurately to more than working +// precision, then their product is too expensive to calculate. We +// use the following method. +// +// Define two mathematical values, delta_1 and delta_2, implicitly +// such that +// +// T_1 = expf( [M_1 log(2)/2^6] - delta_1 ) +// T_2 = expf( [M_2 log(2)/2^12] - delta_2 ) +// +// are representable as 24 significant bits. To illustrate the idea, +// we show how we define delta_1: +// +// T_1 := round_to_24_bits( expf( M_1 log(2)/2^6 ) ) +// delta_1 = (M_1 log(2)/2^6) - log( T_1 ) +// +// The last equality means mathematical equality. We then tabulate +// +// W_1 := expf(delta_1) - 1 +// W_2 := expf(delta_2) - 1 +// +// Both in double precision.
+// +// From the tabulated values T_1, T_2, W_1, W_2, we compose the values +// T and W via +// +// T := T_1 * T_2 ...exactly +// W := W_1 + (1 + W_1)*W_2 +// +// W approximates expf( delta ) - 1 where delta = delta_1 + delta_2. +// The mathematical product of T and (W+1) is an accurate representation +// of 2^(M_1/2^6) * 2^(M_2/2^12). +// +// Step 4. Reconstruction +// +// Finally, we can reconstruct expf(X) and expf(X) - 1. +// Because +// +// X = K * log(2) + (M_1*log(2)/2^6 - delta_1) +// + (M_2*log(2)/2^12 - delta_2) +// + delta_1 + delta_2 + r ...accurately +// We have +// +// expf(X) ~=~ 2^K * ( T + T*[expf(delta_1+delta_2+r) - 1] ) +// ~=~ 2^K * ( T + T*[expf(delta + r) - 1] ) +// ~=~ 2^K * ( T + T*[(expf(delta)-1) +// + expf(delta)*(expf(r)-1)] ) +// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) ) +// ~=~ 2^K * ( Y_hi + Y_lo ) +// +// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r)) +// +// For expf(X)-1, we have +// +// expf(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 +// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) ) +// +// and we combine Y_hi + Y_lo - 2^(-K) into the form of two +// numbers Y_hi + Y_lo carefully. +// +// **** Algorithm Details **** +// +// A careful algorithm must be used to realize the mathematical ideas +// accurately. We describe each of the three cases. We assume SAFE +// is preset to be TRUE. +// +// Case exp_tiny: +// +// The important points are to ensure an accurate result under +// different rounding directions and a correct setting of the SAFE +// flag. +// +// If Flag is 1, then +// SAFE := False ...possibility of underflow +// Scale := 1.0 +// Y_hi := X +// Y_lo := 2^(-17000) +// Else +// Scale := 1.0 +// Y_hi := 1.0 +// Y_lo := X ...for different rounding modes +// Endif +// +// Case exp_small: +// +// Here we compute a simple polynomial. To exploit parallelism, we split +// the polynomial into several portions. +// +// Let r = X +// +// If Flag is not 1 ...i.e.
expf( argument ) +// +// rsq := r * r; +// r4 := rsq*rsq +// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6)) +// poly_hi := r + rsq*(P_1 + r*P_2) +// Y_lo := poly_hi + r4 * poly_lo +// set lsb(Y_lo) to 1 +// Y_hi := 1.0 +// Scale := 1.0 +// +// Else ...i.e. expf( argument ) - 1 +// +// rsq := r * r +// r4 := rsq * rsq +// r6 := rsq * r4 +// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7)) +// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4)) +// Y_lo := rsq*poly_hi + poly_lo +// set lsb(Y_lo) to 1 +// Y_hi := X +// Scale := 1.0 +// +// Endif +// +// Case exp_regular: +// +// The previous description contain enough information except the +// computation of poly and the final Y_hi and Y_lo in the case for +// expf(X)-1. +// +// The computation of poly for Step 2: +// +// rsq := r*r +// poly := r + rsq*(A_1 + r*(A_2 + r*A_3)) +// +// For the case expf(X) - 1, we need to incorporate 2^(-K) into +// Y_hi and Y_lo at the end of Step 4. +// +// If K > 10 then +// Y_lo := Y_lo - 2^(-K) +// Else +// If K < -10 then +// Y_lo := Y_hi + Y_lo +// Y_hi := -2^(-K) +// Else +// Y_hi := Y_hi - 2^(-K) +// End If +// End If +// + +#include "libm_support.h" + + +GR_SAVE_B0 = r60 +GR_SAVE_PFS = r59 +GR_SAVE_GP = r61 + +GR_Parameter_X = r62 +GR_Parameter_Y = r63 +GR_Parameter_RESULT = r64 +GR_Parameter_TAG = r65 + +FR_X = f9 +FR_Y = f1 +FR_RESULT = f99 + + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 64 +Constants_exp_64_Arg: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) +data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 +data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 +data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 +// /* Inv_L, L_hi, L_lo */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) + +.align 64 +Constants_exp_64_Exponents: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) +data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF +data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 
0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF +data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF +ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) + +.align 64 +Constants_exp_64_A: +ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) +data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 +data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 +data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_A) + +.align 64 +Constants_exp_64_P: +ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) +data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 +data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 +data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 +data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_P) + +.align 64 +Constants_exp_64_Q: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object) +data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000 +data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000 +data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000 +data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000 +data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x00000000,0x80000000,0x00003FFE,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_Q) + +.align 64 +Constants_exp_64_T1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) +data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 +data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 +data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC +data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D +data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA +data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516 +data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A +data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4 +data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B +data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD +data4 
0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15 +data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B +data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 +data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A +data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 +data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C +ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) + +.align 64 +Constants_exp_64_T2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) +data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 +data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 +data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E +data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 +data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 +data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA +data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 +data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A +data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 +data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA +data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 +data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA +data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 +data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 +data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE +data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 +ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) + +.align 64 +Constants_exp_64_W1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) +data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454 +data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 +data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA +data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 +data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 +data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE +data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B +data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 +data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 +data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 +data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A +data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB +data4 
0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E +data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA +data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 +data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B +data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 +data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 +data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 +data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 +data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB +data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 +data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C +data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D +data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 +data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F +data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 +data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 +data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC +data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB +data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB +data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) + +.align 64 +Constants_exp_64_W2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) +data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 +data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 +data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A +data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E +data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 +data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 +data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 +data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 +data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 +data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D +data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 +data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 +data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 +data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F +data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 +data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 +data4 
0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D +data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 +data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 +data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED +data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B +data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 +data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 +data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C +data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 +data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE +data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 +data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 +data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 +data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 +data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE +data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) + +.section .text +.proc expm1f# +.global expm1f# +.align 64 + +expm1f: +#ifdef _LIBC +.global __expm1f# +__expm1f: +#endif + + +{ .mii + alloc r32 = ar.pfs,0,30,4,0 +(p0) add r33 = 1, r0 +(p0) cmp.eq.unc p7, p0 = r0, r0 +} +;; + +// +// Set p7 true for expm1 +// Set Flag = r33 = 1 for expm1 +// These are really no longer necesary, but are a remnant +// when this file had multiple entry points. +// They should be carefully removed + + +{ .mfi +(p0) add r32 = 0,r0 +(p0) fnorm.s1 f9 = f8 + nop.i 0 +} + +{ .mfi + nop.m 0 +// +// Set p7 false for exp +// Set Flag = r33 = 0 for exp +// +(p0) fclass.m.unc p6, p8 = f8, 0x1E7 + nop.i 0 ;; +} + +{ .mfi + nop.m 999 +(p0) fclass.nm.unc p9, p0 = f8, 0x1FF + nop.i 0 +} + +{ .mfi + nop.m 999 +(p0) mov f36 = f1 + nop.i 999 ;; +} + +// +// Identify NatVals, NaNs, Infs, and Zeros. +// Identify EM unsupporteds. 
+// Save special input registers +// +// Create FR_X_cor = 0.0 +// GR_Flag = 0 +// GR_Expo_Range = 0 (r32) for single precision +// FR_Scale = 1.0 +// + +{ .mfb + nop.m 999 +(p0) mov f32 = f0 +(p6) br.cond.spnt EXPF_64_SPECIAL ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt EXPF_64_UNSUPPORTED ;; +} + +// +// Branch out for special input values +// + +{ .mfi +(p0) cmp.ne.unc p12, p13 = 0x01, r33 +(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0 +(p0) cmp.eq.unc p15, p0 = r0, r0 +} + +// +// Raise possible denormal operand exception +// Normalize x +// +// This function computes expf( x + x_cor) +// Input FR 1: FR_X +// Input FR 2: FR_X_cor +// Input GR 1: GR_Flag +// Input GR 2: GR_Expo_Range +// Output FR 3: FR_Y_hi +// Output FR 4: FR_Y_lo +// Output FR 5: FR_Scale +// Output PR 1: PR_Safe + +// +// Prepare to load constants +// Set Safe = True +// + +{ .mmi +(p0) addl r34 = @ltoff(Constants_exp_64_Arg#),gp +(p0) addl r40 = @ltoff(Constants_exp_64_W1#),gp +(p0) addl r41 = @ltoff(Constants_exp_64_W2#),gp +};; + +{ .mmi + ld8 r34 = [r34] + ld8 r40 = [r40] +(p0) addl r50 = @ltoff(Constants_exp_64_T1#), gp +} +;; +{ .mmi + ld8 r41 = [r41] +(p0) ldfe f37 = [r34],16 +(p0) addl r51 = @ltoff(Constants_exp_64_T2#), gp +} +;; +// +// N = fcvt.fx(float_N) +// Set p14 if -6 > expo_X +// +// +// Bias = 0x0FFFF +// expo_X = expo_X and Mask +// + +{ .mmi + ld8 r50 = [r50] +(p0) ldfe f40 = [r34],16 + nop.i 999 +} +;; + +{ .mlx + nop.m 999 +(p0) movl r58 = 0x0FFFF +};; + +// +// Load W2_ptr +// Branch to SMALL is expo_X < -6 +// +// +// float_N = X * L_Inv +// expo_X = exponent of X +// Mask = 0x1FFFF +// + +{ .mmi + ld8 r51 = [r51] +(p0) ldfe f41 = [r34],16 +// +// float_N = X * L_Inv +// expo_X = exponent of X +// Mask = 0x1FFFF +// + nop.i 0 +};; + +{ .mlx +(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp +(p0) movl r39 = 0x1FFFF +} +;; + +{ .mmi + ld8 r34 = [r34] +(p0) getf.exp r37 = f9 + nop.i 999 +} +;; + +{ .mii + nop.m 999 + nop.i 999 +(p0) and r37 = r37, r39 ;; +} + 
+{ .mmi +(p0) sub r37 = r37, r58 ;; +(p0) cmp.gt.unc p14, p0 = -6, r37 +(p0) cmp.lt.unc p10, p0 = 14, r37 ;; +} + +{ .mfi + nop.m 999 +// +// Load L_inv +// Set p12 true for Flag = 0 (exp) +// Set p13 true for Flag = 1 (expm1) +// +(p0) fmpy.s1 f38 = f9, f37 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// Load L_hi +// expo_X = expo_X - Bias +// get W1_ptr +// +(p0) fcvt.fx.s1 f39 = f38 +(p14) br.cond.spnt EXPF_SMALL ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.spnt EXPF_HUGE ;; +} + +{ .mmi +(p0) shladd r34 = r32,4,r34 +(p0) addl r35 = @ltoff(Constants_exp_64_A#),gp + nop.i 999 +} +;; + +{ .mmi + ld8 r35 = [r35] + nop.m 999 + nop.i 999 +} +;; + +// +// Load T_1,T_2 +// + +{ .mmb +(p0) ldfe f51 = [r35],16 +(p0) ld8 r45 = [r34],8 + nop.b 999 ;; +} +// +// Set Safe = True if k >= big_expo_neg +// Set Safe = False if k < big_expo_neg +// + +{ .mmb +(p0) ldfe f49 = [r35],16 +(p0) ld8 r48 = [r34],0 + nop.b 999 ;; +} + +{ .mfi + nop.m 999 +// +// Branch to HUGE is expo_X > 14 +// +(p0) fcvt.xf f38 = f39 + nop.i 999 ;; +} + +{ .mfi +(p0) getf.sig r52 = f39 + nop.f 999 + nop.i 999 ;; +} + +{ .mii + nop.m 999 +(p0) extr.u r43 = r52, 6, 6 ;; +// +// r = r - float_N * L_lo +// K = extr(N_fix,12,52) +// +(p0) shladd r40 = r43,3,r40 ;; +} + +{ .mfi +(p0) shladd r50 = r43,2,r50 +(p0) fnma.s1 f42 = f40, f38, f9 +// +// float_N = float(N) +// N_fix = signficand N +// +(p0) extr.u r42 = r52, 0, 6 +} + +{ .mmi +(p0) ldfd f43 = [r40],0 ;; +(p0) shladd r41 = r42,3,r41 +(p0) shladd r51 = r42,2,r51 +} +// +// W_1_p1 = 1 + W_1 +// + +{ .mmi +(p0) ldfs f44 = [r50],0 ;; +(p0) ldfd f45 = [r41],0 +// +// M_2 = extr(N_fix,0,6) +// M_1 = extr(N_fix,6,6) +// r = X - float_N * L_hi +// +(p0) extr r44 = r52, 12, 52 +} + +{ .mmi +(p0) ldfs f46 = [r51],0 ;; +(p0) sub r46 = r58, r44 +(p0) cmp.gt.unc p8, p15 = r44, r45 +} +// +// W = W_1 + W_1_p1*W_2 +// Load A_2 +// Bias_m_K = Bias - K +// + +{ .mii +(p0) ldfe f40 = [r35],16 +// +// load A_1 +// poly = A_2 + r*A_3 +// rsq = r * r +// 
neg_2_mK = exponent of Bias_m_k +// +(p0) add r47 = r58, r44 ;; +// +// Set Safe = True if k <= big_expo_pos +// Set Safe = False if k > big_expo_pos +// Load A_3 +// +(p15) cmp.lt p8,p15 = r44,r48 ;; +} + +{ .mmf +(p0) setf.exp f61 = r46 +// +// Bias_p + K = Bias + K +// T = T_1 * T_2 +// +(p0) setf.exp f36 = r47 +(p0) fnma.s1 f42 = f41, f38, f42 ;; +} + +{ .mfi + nop.m 999 +// +// Load W_1,W_2 +// Load big_exp_pos, load big_exp_neg +// +(p0) fadd.s1 f47 = f43, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 f52 = f42, f51, f49 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 f48 = f42, f42 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 f53 = f44, f46 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 f54 = f45, f47, f43 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fneg f61 = f61 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 f52 = f42, f52, f40 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fadd.s1 f55 = f54, f1 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// W + Wp1 * poly +// +(p0) mov f34 = f53 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// A_1 + r * poly +// Scale = setf_expf(Bias_p_k) +// +(p0) fma.s1 f52 = f48, f52, f42 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// poly = r + rsq(A_1 + r*poly) +// Wp1 = 1 + W +// neg_2_mK = -neg_2_mK +// +(p0) fma.s1 f35 = f55, f52, f54 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fmpy.s1 f35 = f35, f53 +// +// Y_hi = T +// Y_lo = T * (W + Wp1*poly) +// +(p12) br.cond.sptk EXPF_MAIN ;; +} +// +// Branch if expf(x) +// Continue for expf(x-1) +// + +{ .mii +(p0) cmp.lt.unc p12, p13 = 10, r44 + nop.i 999 ;; +// +// Set p12 if 10 < K, Else p13 +// +(p13) cmp.gt.unc p13, p14 = -10, r44 ;; +} +// +// K > 10: Y_lo = Y_lo + neg_2_mK +// K <=10: Set p13 if -10 > K, Else set p14 +// + +{ .mfi +(p13) cmp.eq p15, p0 = r0, r0 +(p14) fadd.s1 f34 = f61, f34 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fadd.s1 f35 = f35, f61 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p13) fadd.s1 f35 = f35, f34 + nop.i 999 +} + +{ 
.mfb + nop.m 999 +// +// K <= 10 and K < -10, Set Safe = True +// K <= 10 and K < 10, Y_lo = Y_hi + Y_lo +// K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk +// +(p13) mov f34 = f61 +(p0) br.cond.sptk EXPF_MAIN ;; +} +EXPF_SMALL: +{ .mmi +(p12) addl r35 = @ltoff(Constants_exp_64_P#), gp +(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp + nop.i 999 +} +;; + +{ .mmi +(p12) ld8 r35 = [r35] + ld8 r34 = [r34] + nop.i 999 +} +;; + + +{ .mmi +(p13) addl r35 = @ltoff(Constants_exp_64_Q#), gp + nop.m 999 + nop.i 999 +} +;; + + +// +// Return +// K <= 10 and K < 10, Y_hi = neg_2_mk +// +// /*******************************************************/ +// /*********** Branch EXP_SMALL *************************/ +// /*******************************************************/ + +{ .mfi +(p13) ld8 r35 = [r35] +(p0) mov f42 = f9 +(p0) add r34 = 0x48,r34 +} +;; + +// +// Flag = 0 +// r4 = rsq * rsq +// + +{ .mfi +(p0) ld8 r49 =[r34],0 + nop.f 999 + nop.i 999 ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +// +// Flag = 1 +// +(p0) cmp.lt.unc p14, p0 = r37, r49 ;; +} + +{ .mfi + nop.m 999 +// +// r = X +// +(p0) fmpy.s1 f48 = f42, f42 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// rsq = r * r +// +(p0) fmpy.s1 f50 = f48, f48 +// +// Is input very small? 
+// +(p14) br.cond.spnt EXPF_VERY_SMALL ;; +} +// +// Flag_not1: Y_hi = 1.0 +// Flag is 1: r6 = rsq * r4 +// + +{ .mfi +(p12) ldfe f52 = [r35],16 +(p12) mov f34 = f1 +(p0) add r53 = 0x1,r0 ;; +} + +{ .mfi +(p13) ldfe f51 = [r35],16 +// +// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo +// +(p13) mov f34 = f9 + nop.i 999 ;; +} + +{ .mmf +(p12) ldfe f53 = [r35],16 +// +// For Flag_not_1, Y_hi = X +// Scale = 1 +// Create 0x000...01 +// +(p0) setf.sig f37 = r53 +(p0) mov f36 = f1 ;; +} + +{ .mmi +(p13) ldfe f52 = [r35],16 ;; +(p12) ldfe f54 = [r35],16 + nop.i 999 ;; +} + +{ .mfi +(p13) ldfe f53 = [r35],16 +(p13) fmpy.s1 f58 = f48, f50 + nop.i 999 ;; +} +// +// Flag_not1: poly_lo = P_5 + r*P_6 +// Flag_1: poly_lo = Q_6 + r*Q_7 +// + +{ .mmi +(p13) ldfe f54 = [r35],16 ;; +(p12) ldfe f55 = [r35],16 + nop.i 999 ;; +} + +{ .mmi +(p12) ldfe f56 = [r35],16 ;; +(p13) ldfe f55 = [r35],16 + nop.i 999 ;; +} + +{ .mmi +(p12) ldfe f57 = [r35],0 ;; +(p13) ldfe f56 = [r35],16 + nop.i 999 ;; +} + +{ .mfi +(p13) ldfe f57 = [r35],0 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// For Flag_not_1, load p5,p6,p1,p2 +// Else load p5,p6,p1,p2 +// +(p12) fma.s1 f60 = f52, f42, f53 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p13) fma.s1 f60 = f51, f42, f52 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 f60 = f60, f42, f54 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 f59 = f56, f42, f57 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p13) fma.s1 f60 = f42, f60, f53 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 f59 = f59, f48, f42 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7) +// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6) +// Flag_not1: poly_hi = (P_1 + r*P_2) +// +(p13) fmpy.s1 f60 = f60, f58 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fma.s1 f60 = f60, f42, f55 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_1: poly_lo = r6 *(Q_5 + ....) 
+// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2) +// +(p12) fma.s1 f35 = f60, f50, f59 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p13) fma.s1 f59 = f54, f42, f55 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_not1: Y_lo = rsq* poly_hi + poly_lo +// Flag_1: poly_lo = rsq* poly_hi + poly_lo +// +(p13) fma.s1 f59 = f59, f42, f56 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_not_1: (P_1 + r*P_2) +// +(p13) fma.s1 f59 = f59, f42, f57 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2) +// +(p13) fma.s1 f35 = f59, f48, f60 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Create 0.000...01 +// +(p0) for f37 = f35, f37 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// Set lsb of Y_lo to 1 +// +(p0) fmerge.se f35 = f35,f37 +(p0) br.cond.sptk EXPF_MAIN ;; +} +EXPF_VERY_SMALL: + +{ .mmi + nop.m 999 +(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp + nop.i 999;; +} + +{ .mfi +(p13) ld8 r34 = [r34]; +(p12) mov f35 = f9 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p12) mov f34 = f1 +(p12) br.cond.sptk EXPF_MAIN ;; +} + +{ .mlx +(p13) add r34 = 8,r34 +(p13) movl r39 = 0x0FFFE ;; +} +// +// Load big_exp_neg +// Create 1/2's exponent +// + +{ .mii +(p13) setf.exp f56 = r39 +(p13) shladd r34 = r32,4,r34 ;; + nop.i 999 +} +// +// Negative exponents are stored after positive +// + +{ .mfi +(p13) ld8 r45 = [r34],0 +// +// Y_hi = x +// Scale = 1 +// +(p13) fmpy.s1 f35 = f9, f9 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Reset Safe if necessary +// Create 1/2 +// +(p13) mov f34 = f9 + nop.i 999 ;; +} + +{ .mfi +(p13) cmp.lt.unc p0, p15 = r37, r45 +(p13) mov f36 = f1 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// Y_lo = x * x +// +(p13) fmpy.s1 f35 = f35, f56 +// +// Y_lo = x*x/2 +// +(p13) br.cond.sptk EXPF_MAIN ;; +} +EXPF_HUGE: + +{ .mfi + nop.m 999 +(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0 + nop.i 999 +} + +{ .mlx + nop.m 999 +(p0) movl r39 = 0x15DC0 ;; +} + +{ .mfi +(p14) setf.exp f34 = r39 +(p14) mov f35 = f1 +(p14) cmp.eq p0, p15 
= r0, r0 ;; +} + +{ .mfb + nop.m 999 +(p14) mov f36 = f34 +// +// If x > 0, Set Safe = False +// If x > 0, Y_hi = 2**(24,000) +// If x > 0, Y_lo = 1.0 +// If x > 0, Scale = 2**(24,000) +// +(p14) br.cond.sptk EXPF_MAIN ;; +} + +{ .mlx + nop.m 999 +(p12) movl r39 = 0xA240 +} + +{ .mlx + nop.m 999 +(p12) movl r38 = 0xA1DC ;; +} + +{ .mmb +(p13) cmp.eq p15, p14 = r0, r0 +(p12) setf.exp f34 = r39 + nop.b 999 ;; +} + +{ .mlx +(p12) setf.exp f35 = r38 +(p13) movl r39 = 0xFF9C +} + +{ .mfi + nop.m 999 +(p13) fsub.s1 f34 = f0, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) mov f36 = f34 +(p12) cmp.eq p0, p15 = r0, r0 ;; +} + +{ .mfi +(p13) setf.exp f35 = r39 +(p13) mov f36 = f1 + nop.i 999 ;; +} +EXPF_MAIN: + +{ .mfi +(p0) cmp.ne.unc p12, p0 = 0x01, r33 +(p0) fmpy.s1 f101 = f36, f35 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.s.s0 f99 = f34, f36, f101 +(p15) br.cond.sptk EXPF_64_RETURN ;; +} + +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x01 + nop.i 999 +} + +{ .mlx + nop.m 999 +(p0) movl r50 = 0x0000000001007F ;; +} +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + RZ + TD (Underflows) +// +// +// If (Safe) is true, then +// Compute result using user supplied status field. +// No overflow or underflow here, but perhaps inexact. +// Return +// Else +// Determine if overflow or underflow was raised. +// Fetch +/- overflow threshold for IEEE single, double, +// double extended +// + +{ .mfi +(p0) setf.exp f60 = r50 +(p0) fma.s.s3 f102 = f34, f36, f101 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x40 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// For Safe, no need to check for over/under. +// For expm1, handle errors like exp. 
+// +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fma.s.s2 f100 = f34, f36, f101 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x40 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fclass.m.unc p12, p0 = f102, 0x00F + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = f102, 0x00F + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Create largest double exponent + 1. +// Create smallest double exponent - 1. +// +(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60 + nop.i 999 ;; +} +// +// fcmp: resultS2 >= + overflow threshold -> set (a) if true +// fcmp: resultS2 <= - overflow threshold -> set (b) if true +// fclass: resultS3 is denorm/unorm/0 -> set (d) if true +// + +{ .mib +(p10) mov GR_Parameter_TAG = 43 + nop.i 999 +(p10) br.cond.sptk __libm_error_region ;; +} + +{ .mib +(p8) mov GR_Parameter_TAG = 16 + nop.i 999 +(p8) br.cond.sptk __libm_error_region ;; +} +// +// Report that exp overflowed +// + +{ .mib +(p12) mov GR_Parameter_TAG = 44 + nop.i 999 +(p12) br.cond.sptk __libm_error_region ;; +} + +{ .mib +(p11) mov GR_Parameter_TAG = 17 + nop.i 999 +(p11) br.cond.sptk __libm_error_region ;; +} + +{ .mib + nop.m 999 + nop.i 999 +// +// Report that exp underflowed +// +(p0) br.cond.sptk EXPF_64_RETURN ;; +} +EXPF_64_SPECIAL: + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6, p0 = f8, 0x0c3 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p13, p8 = f8, 0x007 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fclass.m.unc p14, p0 = f8, 0x007 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p12, p9 = f8, 0x021 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = f8, 0x022 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fclass.m.unc p10, p0 = f8, 0x022 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Identify +/- 0, Inf, or -Inf +// Generate the right kind of NaN. 
+// +(p13) fadd.s.s0 f99 = f0, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p14) mov f99 = f8 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p6) fadd.s.s0 f99 = f8, f1 +// +// expf(+/-0) = 1 +// expm1f(+/-0) = +/-0 +// No exceptions raised +// +(p6) br.cond.sptk EXPF_64_RETURN ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p14) br.cond.sptk EXPF_64_RETURN ;; +} + +{ .mfi + nop.m 999 +(p11) mov f99 = f0 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p10) fsub.s.s1 f99 = f0, f1 +// +// expf(-Inf) = 0 +// expm1f(-Inf) = -1 +// No exceptions raised. +// +(p10) br.cond.sptk EXPF_64_RETURN ;; +} + +{ .mfb + nop.m 999 +(p12) fmpy.s.s1 f99 = f8, f1 +// +// expf(+Inf) = Inf +// No exceptions raised. +// +(p0) br.cond.sptk EXPF_64_RETURN ;; +} +EXPF_64_UNSUPPORTED: + +{ .mfb + nop.m 999 +(p0) fmpy.s.s0 f99 = f8, f0 + nop.b 0;; +} + +EXPF_64_RETURN: +{ .mfb + nop.m 999 +(p0) mov f8 = f99 +(p0) br.ret.sptk b0 +} +.endp expm1f +ASM_SIZE_DIRECTIVE(expm1f) + + +// +// __libm_error_region: error exit for the code above, which branches here +// with GR_Parameter_TAG already set. It builds a 64-byte frame, stores +// FR_Y, FR_X and FR_RESULT in it, leaves the three slot addresses in +// GR_Parameter_Y, GR_Parameter_X and GR_Parameter_RESULT, calls +// __libm_error_support, and returns in f8 whatever is then in the result slot. +// +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address (32 bytes above the new sp) + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Allocate the 64-byte frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2; the post-increment moves the pointer to the result slot + add GR_Parameter_X = 16,sp // Parameter 1 address (16 bytes above the new sp) +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 (result) address (48 bytes above sp) + nop.b 0 +} +{ .mib + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y // Back to the Parameter 2 address + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp // Recompute the result address after the call instead of reusing the pre-call value +};; +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 //
Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return through the b0 saved on entry +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_expm1l.S b/sysdeps/ia64/fpu/s_expm1l.S new file mode 100644 index 0000000000..a31910af5c --- /dev/null +++ b/sysdeps/ia64/fpu/s_expm1l.S @@ -0,0 +1,1603 @@ +.file "exp_m1l.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT.
+// +// ********************************************************************* +// +// Function: Combined expl(x) and expm1l(x), where +// expl(x) = e^x, for double-extended precision x values +// expm1l(x) = e^x - 1, for double-extended precision x values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9,f32-f61, f99-f102 +// +// General Purpose Registers: +// r32-r61 +// r62-r65 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions raised when appropriate for exp and expm1 +// Underflow exceptions raised when appropriate for exp and expm1 +// (Error Handling Routine called for overflow and underflow) +// Inexact raised when appropriate by algorithm +// +// expl(inf) = inf +// expl(-inf) = +0 +// expl(SNaN) = QNaN +// expl(QNaN) = QNaN +// expl(0) = 1 +// expl(EM_special Values) = QNaN +// expm1l(inf) = inf +// expm1l(-inf) = -1 +// expm1l(SNaN) = QNaN +// expm1l(QNaN) = QNaN +// expm1l(0) = 0 +// expm1l(EM_special Values) = QNaN +// +// ********************************************************************* +// +// Implementation and Algorithm Notes: +// +// ker_exp_64( in_FR : X, +// in_GR : Flag, +// in_GR : Expo_Range, +// out_FR : Y_hi, +// out_FR : Y_lo, +// out_FR : scale, +// out_PR : Safe ) +// +// On input, X is in register format and +// Flag = 0 for exp, +// Flag = 1 for expm1. +// +// On output, provided X and X_cor are real numbers, then +// +// scale*(Y_hi + Y_lo) approximates expl(X) if Flag is 0 +// scale*(Y_hi + Y_lo) approximates expl(X)-1 if Flag is 1 +// +// The accuracy is sufficient for a highly accurate 64 sig. +// bit implementation.
Safe is set if there is no danger of +// overflow/underflow when the result is composed from scale, +// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set. +// Otherwise, one must prepare to handle the possible exception +// appropriately. Note that SAFE not set (false) does not mean +// that overflow/underflow will occur; only the setting of SAFE +// guarantees the opposite. +// +// **** High Level Overview **** +// +// The method consists of three cases. +// +// If |X| < Tiny use case exp_tiny; +// else if |X| < 2^(-6) use case exp_small; +// else use case exp_regular; +// +// Case exp_tiny: +// +// 1 + X can be used to approximate expl(X) or expl(X+X_cor); +// X + X^2/2 can be used to approximate expl(X) - 1 +// +// Case exp_small: +// +// Here, expl(X), expl(X+X_cor), and expl(X) - 1 can all be +// approximated by a relatively simple polynomial. +// +// This polynomial resembles the truncated Taylor series +// +// expl(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! +// +// Case exp_regular: +// +// Here we use a table lookup method. The basic idea is that in +// order to compute expl(X), we accurately decompose X into +// +// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13. +// +// Hence +// +// expl(X) = 2^( N / 2^12 ) * expl(r). +// +// The value 2^( N / 2^12 ) is obtained by simple combinations +// of values calculated beforehand and stored in tables; expl(r) +// is approximated by a short polynomial because |r| is small. +// +// We elaborate this method in 4 steps. +// +// Step 1: Reduction +// +// The value 2^12/log(2) is stored as a double-extended number +// L_Inv. +// +// N := round_to_nearest_integer( X * L_Inv ) +// +// The value log(2)/2^12 is stored as two numbers L_hi and L_lo so +// that r can be computed accurately via +// +// r := (X - N*L_hi) - N*L_lo +// +// We pick L_hi such that N*L_hi is representable in 64 sig. bits +// and thus the FMA X - N*L_hi is error free.
So r is within +// 1 rounding error of an exact reduction with respect to +// +// L_hi + L_lo. +// +// In particular, L_hi has 30 significant bits and can be stored +// as a double-precision number; L_lo has 64 significant bits and +// is stored as a double-extended number. +// +// In the case Flag = 2, we further modify r by +// +// r := r + X_cor. +// +// Step 2: Approximation +// +// expl(r) - 1 is approximated by a short polynomial of the form +// +// r + A_1 r^2 + A_2 r^3 + A_3 r^4 . +// +// Step 3: Composition from Table Values +// +// The value 2^( N / 2^12 ) can be composed from a couple of tables +// of precalculated values. First, express N as three integers +// K, M_1, and M_2 as +// +// N = K * 2^12 + M_1 * 2^6 + M_2 +// +// Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative. +// When N is represented in 2's complement, M_2 is simply the 6 +// lsb's, M_1 is the next 6, and K is simply N shifted right +// arithmetically (sign extended) by 12 bits. +// +// Now, 2^( N / 2^12 ) is simply +// +// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 ) +// +// Clearly, 2^K needs no tabulation. The other two values are less +// trivial because if we store each accurately to more than working +// precision, then their product is too expensive to calculate. We +// use the following method. +// +// Define two mathematical values, delta_1 and delta_2, implicitly +// such that +// +// T_1 = expl( [M_1 log(2)/2^6] - delta_1 ) +// T_2 = expl( [M_2 log(2)/2^12] - delta_2 ) +// +// are representable as 24 significant bits. To illustrate the idea, +// we show how we define delta_1: +// +// T_1 := round_to_24_bits( expl( M_1 log(2)/2^6 ) ) +// delta_1 = (M_1 log(2)/2^6) - log( T_1 ) +// +// The last equality means mathematical equality. We then tabulate +// +// W_1 := expl(delta_1) - 1 +// W_2 := expl(delta_2) - 1 +// +// Both in double precision.
+// +// From the tabulated values T_1, T_2, W_1, W_2, we compose the values +// T and W via +// +// T := T_1 * T_2 ...exactly +// W := W_1 + (1 + W_1)*W_2 +// +// W approximates expl( delta ) - 1 where delta = delta_1 + delta_2. +// The mathematical product of T and (W+1) is an accurate representation +// of 2^(M_1/2^6) * 2^(M_2/2^12). +// +// Step 4: Reconstruction +// +// Finally, we can reconstruct expl(X), expl(X) - 1. +// Because +// +// X = K * log(2) + (M_1*log(2)/2^6 - delta_1) +// + (M_2*log(2)/2^12 - delta_2) +// + delta_1 + delta_2 + r ...accurately +// We have +// +// expl(X) ~=~ 2^K * ( T + T*[expl(delta_1+delta_2+r) - 1] ) +// ~=~ 2^K * ( T + T*[expl(delta + r) - 1] ) +// ~=~ 2^K * ( T + T*[(expl(delta)-1) +// + expl(delta)*(expl(r)-1)] ) +// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) ) +// ~=~ 2^K * ( Y_hi + Y_lo ) +// +// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r)) +// +// For expl(X)-1, we have +// +// expl(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 +// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) ) +// +// and we combine Y_hi + Y_lo - 2^(-K) into the form of two +// numbers Y_hi + Y_lo carefully. +// +// **** Algorithm Details **** +// +// A careful algorithm must be used to realize the mathematical ideas +// accurately. We describe each of the three cases. We assume SAFE +// is preset to be TRUE. +// +// Case exp_tiny: +// +// The important points are to ensure an accurate result under +// different rounding directions and a correct setting of the SAFE +// flag. +// +// If Flag is 1, then +// SAFE := False ...possibility of underflow +// Scale := 1.0 +// Y_hi := X +// Y_lo := X*X/2 +// Else +// Scale := 1.0 +// Y_hi := 1.0 +// Y_lo := X ...for different rounding modes +// Endif +// +// Case exp_small: +// +// Here we compute a simple polynomial. To exploit parallelism, we split +// the polynomial into several portions. +// +// Let r = X +// +// If Flag is not 1 ...i.e.
expl( argument ) +// +// rsq := r * r; +// r4 := rsq*rsq +// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6)) +// poly_hi := r + rsq*(P_1 + r*P_2) +// Y_lo := poly_hi + r4 * poly_lo +// set lsb(Y_lo) to 1 +// Y_hi := 1.0 +// Scale := 1.0 +// +// Else ...i.e. expl( argument ) - 1 +// +// rsq := r * r +// r4 := rsq * rsq +// r6 := rsq * r4 +// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7)) +// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4)) +// Y_lo := rsq*poly_hi + poly_lo +// set lsb(Y_lo) to 1 +// Y_hi := X +// Scale := 1.0 +// +// Endif +// +// Case exp_regular: +// +// The previous description contain enough information except the +// computation of poly and the final Y_hi and Y_lo in the case for +// expl(X)-1. +// +// The computation of poly for Step 2: +// +// rsq := r*r +// poly := r + rsq*(A_1 + r*(A_2 + r*A_3)) +// +// For the case expl(X) - 1, we need to incorporate 2^(-K) into +// Y_hi and Y_lo at the end of Step 4. +// +// If K > 10 then +// Y_lo := Y_lo - 2^(-K) +// Else +// If K < -10 then +// Y_lo := Y_hi + Y_lo +// Y_hi := -2^(-K) +// Else +// Y_hi := Y_hi - 2^(-K) +// End If +// End If +// + +#include "libm_support.h" + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 64 +Constants_exp_64_Arg: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) +data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 +data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 +data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 +// /* Inv_L, L_hi, L_lo */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) + +.align 64 +Constants_exp_64_Exponents: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) +data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF +data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF +data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF +ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) + +.align 64 +Constants_exp_64_A: 
+ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) +data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 +data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 +data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_A) + +.align 64 +Constants_exp_64_P: +ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) +data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 +data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 +data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 +data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_P) + +.align 64 +Constants_exp_64_Q: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object) +data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000 +data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000 +data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000 +data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000 +data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x00000000,0x80000000,0x00003FFE,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_Q) + +.align 64 +Constants_exp_64_T1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) +data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 +data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 +data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC +data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D +data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA +data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516 +data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A +data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4 +data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B +data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD +data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15 +data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B +data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 +data4 
0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A +data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 +data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C +ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) + +.align 64 +Constants_exp_64_T2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) +data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 +data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 +data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E +data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 +data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 +data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA +data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 +data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A +data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 +data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA +data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 +data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA +data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 +data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 +data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE +data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 +ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) + +.align 64 +Constants_exp_64_W1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) +data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454 +data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 +data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA +data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 +data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 +data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE +data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B +data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 +data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 +data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 +data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A +data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB +data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E +data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA +data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 +data4 
0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B +data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 +data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 +data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 +data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 +data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB +data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 +data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C +data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D +data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 +data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F +data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 +data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 +data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC +data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB +data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB +data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) + +.align 64 +Constants_exp_64_W2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) +data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 +data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 +data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A +data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E +data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 +data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 +data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 +data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 +data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 +data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D +data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 +data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 +data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 +data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F +data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 +data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 +data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D +data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 +data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 +data4 
0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED +data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B +data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 +data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 +data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C +data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 +data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE +data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 +data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 +data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 +data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 +data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE +data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) + +GR_SAVE_PFS = r59 +GR_SAVE_B0 = r60 +GR_SAVE_GP = r61 +GR_Parameter_X = r62 +GR_Parameter_Y = r63 +GR_Parameter_RESULT = r64 +GR_Parameter_TAG = r65 + +FR_X = f9 +FR_Y = f9 +FR_RESULT = f99 + +.section .text +.proc expm1l# +.global expm1l# +.align 64 +expm1l: +#ifdef _LIBC +.global __expm1l# +__expm1l: +#endif +{ .mii +alloc r32 = ar.pfs,0,30,4,0 +(p0) add r33 = 1, r0 +(p0) cmp.eq.unc p7, p0 = r0, r0 +} +{ .mbb + nop.m 999 +(p0) br.cond.sptk exp_continue + nop.b 999 ;; +} + +// +// expm1l entry: the bundles above set p7 true and Flag = r33 = 1, +// then branch to exp_continue, the body shared with expl. +// + +.endp expm1l +ASM_SIZE_DIRECTIVE(expm1l) + +.section .text +.proc expl# +.global expl# +.align 64 +expl: +#ifdef _LIBC +.global __ieee754_expl# +__ieee754_expl: +#endif +{ .mii +alloc r32 = ar.pfs,0,30,4,0 +(p0) add r33 = r0, r0 +(p0) cmp.eq.unc p0, p7 = r0, r0 ;; +} +exp_continue: +{ .mfi +(p0) add r32 = 2,r0 +(p0) fnorm.s1 f9 = f8 + nop.i 0 +} +{ .mfi +(p0) nop.m 0 +// +// The expl entry above set p7 false and Flag = r33 = 0. +// From exp_continue on: r32 = GR_Expo_Range = 2, f9 = normalized x. +// +(p0) fclass.m.unc p6, p8 = f8, 0x1E7 + nop.i 0;; +} +{ .mfi + nop.m 999 +(p0) fclass.nm.unc p9, p0 = f8, 0x1FF + nop.i 0 +} +{ .mfi + nop.m 999 +(p0) mov f36 = f1 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// p6: x is a NatVal, NaN, Inf, or Zero (first fclass above). +// p9: x is an EM unsupported encoding (second fclass above).
+// f32 = 0.0, presumably FR_X_cor (f36 = FR_Scale = 1.0 was set above) +(p0) mov f32 = f0 +// +// GR_Flag = r33 (0 for exp, 1 for expm1) +// GR_Expo_Range = 2 (r32) for double-extended precision +// Branch out for special input values +// +(p6) br.cond.spnt EXPL_64_SPECIAL ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt EXPL_64_UNSUPPORTED ;; +} +{ .mfi +(p0) cmp.ne.unc p12, p13 = 0x01, r33 +// +// Set p12 true for Flag = 0 (exp) +// Set p13 true for Flag = 1 (expm1) +// Raise possible denormal operand exception (fcmp on s0) +// Set Safe (p15) = True +// +(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0 +(p0) cmp.eq.unc p15, p0 = r0, r0 +} +{ .mmi + nop.m 999 +// +// The kernel below computes expl( x + x_cor) +// Input FR 1: FR_X +// Input FR 2: FR_X_cor +// Input GR 1: GR_Flag +// Input GR 2: GR_Expo_Range +// Output FR 3: FR_Y_hi +// Output FR 4: FR_Y_lo +// Output FR 5: FR_Scale +// Output PR 1: PR_Safe +// +// Prepare to load constants: get the Arg and W1 table pointers +// +(p0) addl r34 = @ltoff(Constants_exp_64_Arg#),gp +(p0) addl r40 = @ltoff(Constants_exp_64_W1#),gp +};; +// +// Load the Arg and W1 table addresses; get W2_ptr +// + +{ .mmi + ld8 r34 = [r34] + ld8 r40 = [r40] +(p0) addl r41 = @ltoff(Constants_exp_64_W2#),gp +};; + +{ .mmi +(p0) ldfe f37 = [r34],16 +(p0) ld8 r41 = [r41] ;; +} + +// +// Loaded L_Inv (f37) and the W2 table address (r41) above +// Load L_hi (f40) +// + +{ .mmi +(p0) ldfe f40 = [r34],16 + nop.m 999 +// +// Get T1_ptr and T2_ptr +// +(p0) addl r50 = @ltoff(Constants_exp_64_T1#),gp +} +{ .mmi + nop.m 999 + nop.m 999 +(p0) addl r51 = @ltoff(Constants_exp_64_T2#),gp ;; +} +// +// Load the T1 and T2 table addresses +// + +{.mmi +(p0) ld8 r50 = [r50] +(p0) ld8 r51 = [r51] +};; + +{ .mlx +(p0) ldfe f41 = [r34],16 +// +// Load L_lo (f41) +// Bias = 0x0FFFF +// Mask = 0x1FFFF +// +(p0) movl r58 = 0x0FFFF +} +{ .mlx + nop.m 999 +(p0) movl r39 = 0x1FFFF ;; +} +// +// expo_X = exponent of X +// expo_X = expo_X and Mask +// expo_X = expo_X - Bias +// +{ .mmi +(p0) getf.exp r37 = f9 + nop.m 999 +(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp ;; +} +{ .mii +(p0) ld8 r34 = [r34] + nop.i 999 +(p0) and r37 = r37, r39 ;; +} +{ .mmi +(p0) sub r37 = r37,
r58 ;; +(p0) cmp.gt.unc p14, p0 = -6, r37 +(p0) cmp.lt.unc p10, p0 = 14, r37 ;; +} +{ .mfi +(p0) nop.m 0 +// +// Above: Set p14 if -6 > expo_X, Set p10 if 14 < expo_X +// float_N = X * L_Inv +// +(p0) fmpy.s1 f38 = f9, f37 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// N = fcvt.fx(float_N) +// Branch to SMALL if expo_X < -6 +// +(p0) fcvt.fx.s1 f39 = f38 +(p14) br.cond.spnt EXPL_SMALL ;; +} +// +// Branch to HUGE if expo_X > 14 +// +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.spnt EXPL_HUGE ;; +} +// +// Step r34 to the Exponents table row selected by GR_Expo_Range +// Get A_ptr +// +{ .mmi +(p0) shladd r34 = r32,4,r34 + nop.m 999 +(p0) addl r35 = @ltoff(Constants_exp_64_A#),gp ;; +} +// +// Load the A table address +// +{ .mmi + nop.m 999 + ld8 r35 =[r35] + nop.i 99 +};; +// +// Load A_3 and big_exp_pos +// +{ .mmb +(p0) ldfe f51 = [r35],16 +(p0) ld8 r45 = [r34],8 + nop.b 999 ;; +} +// +// Load A_2 and big_exp_neg +// +{ .mmb +(p0) ldfe f49 = [r35],16 +(p0) ld8 r48 = [r34],0 + nop.b 999 ;; +} +{ .mfi + nop.m 999 +// +// float_N = float(N) +// +(p0) fcvt.xf f38 = f39 + nop.i 999 ;; +} +// +// N_fix = significand of N +// +{ .mfi +(p0) getf.sig r52 = f39 + nop.f 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 +(p0) extr.u r43 = r52, 6, 6 ;; +// +// M_1 = extr(N_fix,6,6) (above) +// Index W1 (8-byte entries) and T1 (4-byte entries) by M_1 +// +(p0) shladd r40 = r43,3,r40 ;; +} +{ .mfi +(p0) shladd r50 = r43,2,r50 +(p0) fnma.s1 f42 = f40, f38, f9 +// +// r = X - float_N * L_hi (above) +// M_2 = extr(N_fix,0,6) +// +(p0) extr.u r42 = r52, 0, 6 +} +{ .mmi +(p0) ldfd f43 = [r40],0 ;; +(p0) shladd r41 = r42,3,r41 +(p0) shladd r51 = r42,2,r51 +} +// +// Loaded W_1 above; W2 and T2 are indexed by M_2 +// Load T_1, W_2 +// +{ .mmi +(p0) ldfs f44 = [r50],0 ;; +(p0) ldfd f45 = [r41],0 +// +// K = extr(N_fix,12,52) +// +(p0) extr r44 = r52, 12, 52 +} +{ .mmi +(p0) ldfs f46 = [r51],0 ;; +(p0) sub r46 = r58, r44 +(p0) cmp.gt.unc p8, p15 = r44, r45 +} +// +// Above: Load T_2 +// Bias_m_K = Bias - K +// Set Safe = True if K <= big_exp_pos +// Set Safe = False if K > big_exp_pos +// Load A_1 +// +{ .mii +(p0) ldfe f40 = [r35],16 +// +// Bias_p_K = Bias + K +// +(p0) add r47 = r58, r44 ;; +//
+// Set Safe = False if K < big_exp_neg (tested only while Safe is still True) +// +(p15) cmp.lt p8,p15 = r44,r48 ;; +} +{ .mmf +(p0) setf.exp f61 = r46 +// +// 2^(-K) = setf.exp(Bias_m_K) (above) +// Scale = 2^K = setf.exp(Bias_p_K) +// r = r - float_N * L_lo +// +(p0) setf.exp f36 = r47 +(p0) fnma.s1 f42 = f41, f38, f42 ;; +} +{ .mfi + nop.m 999 +// +// W_1_p1 = 1 + W_1 +// +(p0) fadd.s1 f47 = f43, f1 + nop.i 999 ;; +} +// +// The next seven bundles compute, in order: +// poly = A_2 + r*A_3 +// rsq = r * r +// T = T_1 * T_2 +// W = W_1 + W_1_p1*W_2 +// neg_2_mK = -2^(-K) +// poly = A_1 + r * poly +// Wp1 = 1 + W +// +{ .mfi + nop.m 999 +(p0) fma.s1 f52 = f42, f51, f49 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 f48 = f42, f42 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 f53 = f44, f46 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 f54 = f45, f47, f43 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fneg f61 = f61 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 f52 = f42, f52, f40 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fadd.s1 f55 = f54, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Y_hi = T +// +(p0) mov f34 = f53 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly = r + rsq(A_1 + r*poly) +// +(p0) fma.s1 f52 = f48, f52, f42 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// W + Wp1 * poly +// +(p0) fma.s1 f35 = f55, f52, f54 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p0) fmpy.s1 f35 = f35, f53 +// +// Y_lo = T * (W + Wp1*poly) (above) +// Branch if expl(x) +// +(p12) br.cond.sptk EXPL_MAIN ;; +} +// +// Continue for expm1l(x): fold -2^(-K) into Y_hi and Y_lo +// Set p12 if 10 < K, Else p13 +// +{ .mii +(p0) cmp.lt.unc p12, p13 = 10, r44 + nop.i 999 ;; +// +// K <= 10: Set p13 if -10 > K, Else set p14 +// +(p13) cmp.gt.unc p13, p14 = -10, r44 ;; +} +// +// K < -10: Set Safe = True +// -10 <= K <= 10: Y_hi = Y_hi + neg_2_mK +// +{ .mfi +(p13) cmp.eq p15, p0 = r0, r0 +(p14) fadd.s1 f34 = f61, f34 + nop.i 999 ;; +} +// +// K > 10: Y_lo = Y_lo + neg_2_mK +// +{ .mfi + nop.m 999 +(p12) fadd.s1 f35 = f35, f61 + nop.i 999 ;; +} +// +// K < -10: Y_lo = Y_hi + Y_lo +// +{ .mfi + nop.m 999 +(p13) fadd.s1 f35 = f35, f34 + nop.i 999 +} +{ .mfb + nop.m 999 +// +// K < -10: Y_hi = neg_2_mK
+// Branch to MAIN +// +(p13) mov f34 = f61 +(p0) br.cond.sptk EXPL_MAIN ;; +} +// +// EXPL_SMALL: reached when expo_X < -6. The result is formed from a +// polynomial alone, using table P when p12 (exp) and table Q when p13 (expm1). +// +EXPL_SMALL: +{ .mmi + nop.m 999 +(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp +(p12) addl r35 = @ltoff(Constants_exp_64_P#),gp ;; +} +.pred.rel "mutex",p12,p13 +{ .mmi +(p12) ld8 r35=[r35] +nop.m 999 +(p13) addl r35 = @ltoff(Constants_exp_64_Q#),gp +};; +{ .mmi +(p13) ld8 r35=[r35] +(p0) ld8 r34=[r34] +nop.i 999 +};; +{ .mfi +(p0) add r34 = 0x48,r34 +// +// /*******************************************************/ +// /*********** Branch EXPL_SMALL ************************/ +// /*******************************************************/ +// r34 = address of the very-small threshold (Exponents table, byte offset 0x48) +// r = X +// +(p0) mov f42 = f9 + nop.i 999 ;; +} +// +// Load the very-small exponent threshold +// +{ .mfi +(p0) ld8 r49 =[r34],0 + nop.f 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +// +// Is input very small? Set p14 if expo_X < threshold +// +(p0) cmp.lt.unc p14, p0 = r37, r49 ;; +} +{ .mfi + nop.m 999 +// +// rsq = r * r +// +(p0) fmpy.s1 f48 = f42, f42 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// r4 = rsq * rsq +// +(p0) fmpy.s1 f50 = f48, f48 +// +// Branch to VERY_SMALL if the input is very small
+// +(p14) br.cond.spnt EXPL_VERY_SMALL ;; +} +// +// The P and Q tables are stored reversed, so the loads below fetch +// P_6..P_1 into f52..f57 (exp) and Q_7..Q_1 into f51..f57 (expm1). +// Flag_not1: Y_hi = 1.0 +// +{ .mfi +(p12) ldfe f52 = [r35],16 +(p12) mov f34 = f1 +(p0) add r53 = 0x1,r0 ;; +} +{ .mfi +(p13) ldfe f51 = [r35],16 +// +// Flag_1: Y_hi = X +// +(p13) mov f34 = f9 + nop.i 999 ;; +} +{ .mmf +(p12) ldfe f53 = [r35],16 +// +// Create 0x000...01 +// Scale = 1 +// +(p0) setf.sig f37 = r53 +(p0) mov f36 = f1 ;; +} +{ .mmi +(p13) ldfe f52 = [r35],16 ;; +(p12) ldfe f54 = [r35],16 + nop.i 999 ;; +} +// +// Flag_1: r6 = rsq * r4 +// +{ .mfi +(p13) ldfe f53 = [r35],16 +(p13) fmpy.s1 f58 = f48, f50 + nop.i 999 ;; +} +// +// Load the remaining coefficients +// +{ .mmi +(p13) ldfe f54 = [r35],16 ;; +(p12) ldfe f55 = [r35],16 + nop.i 999 ;; +} +{ .mmi +(p12) ldfe f56 = [r35],16 ;; +(p13) ldfe f55 = [r35],16 + nop.i 999 ;; +} +{ .mmi +(p12) ldfe f57 = [r35],0 ;; +(p13) ldfe f56 = [r35],16 + nop.i 999 ;; +} +{ .mfi +(p13) ldfe f57 = [r35],0 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// The next six bundles compute, in order: +// Flag_not1: poly_lo = P_5 + r*P_6 +// Flag_1: poly_lo = Q_6 + r*Q_7 +// Flag_not1: poly_lo = P_4 + r*poly_lo +// Flag_not1: poly_hi = P_1 + r*P_2 +// Flag_1: poly_lo = Q_5 + r*poly_lo +// Flag_not1: poly_hi = r + rsq*poly_hi +// +(p12) fma.s1 f60 = f52, f42, f53 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p13) fma.s1 f60 = f51, f42, f52 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 f60 = f60, f42, f54 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 f59 = f56, f42, f57 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p13) fma.s1 f60 = f42, f60, f53 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 f59 = f59, f48, f42 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Flag_1: poly_lo = r6 * poly_lo +// Flag_not1: poly_lo = P_3 + r*poly_lo (next bundle) +// +(p13) fmpy.s1 f60 = f60, f58 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 f60 = f60, f42, f55 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Flag_not1: Y_lo = poly_hi + r4 * poly_lo
+// Flag_1: poly_hi = Q_3 + r*Q_4 (next bundle) +// +(p12) fma.s1 f35 = f60, f50, f59 + nop.i 999 +} +{ .mfi + nop.m 999 +(p13) fma.s1 f59 = f54, f42, f55 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Flag_1: poly_hi = Q_2 + r*poly_hi +// +(p13) fma.s1 f59 = f59, f42, f56 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Flag_1: poly_hi = Q_1 + r*poly_hi +// +(p13) fma.s1 f59 = f59, f42, f57 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Flag_1: Y_lo = rsq*poly_hi + poly_lo +// +(p13) fma.s1 f35 = f59, f48, f60 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// OR the 0x000...01 pattern into a copy of Y_lo +// +(p0) for f37 = f35, f37 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Set lsb of Y_lo to 1 (sign and exponent stay those of Y_lo) +// Branch to MAIN +// +(p0) fmerge.se f35 = f35,f37 +(p0) br.cond.sptk EXPL_MAIN ;; +} +// +// EXPL_VERY_SMALL: exp (p12) uses Y_hi = 1.0, Y_lo = x; +// expm1 (p13) uses Y_hi = x, Y_lo = x*x/2 and re-evaluates Safe. +// +EXPL_VERY_SMALL: +{ .mmi + nop.m 999 + nop.m 999 +(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp +} +{ .mfi + nop.m 999 +(p12) mov f35 = f9 + nop.i 999 ;; +} +{ .mfb +(p13) ld8 r34 = [r34] +(p12) mov f34 = f1 +(p12) br.cond.sptk EXPL_MAIN ;; +} +{ .mlx +(p13) add r34 = 8,r34 +(p13) movl r39 = 0x0FFFE ;; +} +// +// Create 1/2 (f56) +// Point r34 at big_exp_neg for this GR_Expo_Range +// +{ .mii +(p13) setf.exp f56 = r39 +(p13) shladd r34 = r32,4,r34 ;; + nop.i 999 +} +// +// Load big_exp_neg (negative exponents are stored after positive) +// +{ .mfi +(p13) ld8 r45 = [r34],0 +// +// Y_lo = x * x +// +(p13) fmpy.s1 f35 = f9, f9 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Y_hi = x +// +(p13) mov f34 = f9 + nop.i 999 ;; +} +// +// Reset Safe if necessary: Safe stays True only if expo_X >= big_exp_neg +// Scale = 1 +// +{ .mfi +(p13) cmp.lt.unc p0, p15 = r37, r45 +(p13) mov f36 = f1 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Y_lo = x*x/2 +// +(p13) fmpy.s1 f35 = f35, f56 +// +// Branch to MAIN +// +(p13) br.cond.sptk EXPL_MAIN ;; +} +// +// EXPL_HUGE: reached when expo_X > 14. Extreme Y_hi, Y_lo and Scale values +// are substituted so that EXPL_MAIN overflows for x > 0, underflows for +// exp of x < 0, and gives -1 (inexact) for expm1 of x < 0. +// +EXPL_HUGE: +{ .mfi + nop.m 999 +(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl r39 = 0x15DC0 ;; +} +{ .mfi +(p14) setf.exp f34 = r39 +(p14) mov f35 = f1 +(p14) cmp.eq p0, p15 = r0, r0 ;; +} +{ .mfb + nop.m 999
+(p14) mov f36 = f34 +// +// If x > 0, Set Safe = False +// If x > 0, Y_hi = 2**(24,000) +// If x > 0, Y_lo = 1.0 +// If x > 0, Scale = 2**(24,000) +// If x > 0, Branch to MAIN +// +(p14) br.cond.sptk EXPL_MAIN ;; +} +// +// x < 0, exp (p12): Set Safe = False +// Y_hi = Scale = setf.exp(0xA240), Y_lo = setf.exp(0xA1DC) +// x < 0, expm1 (p13): Set Safe = True +// Y_hi = -1.0, Y_lo = setf.exp(0xFF9C), Scale = 1.0 +// +{ .mlx + nop.m 999 +(p12) movl r39 = 0xA240 +} +{ .mlx + nop.m 999 +(p12) movl r38 = 0xA1DC ;; +} +{ .mmb +(p13) cmp.eq p15, p14 = r0, r0 +(p12) setf.exp f34 = r39 + nop.b 999 ;; +} +{ .mlx +(p12) setf.exp f35 = r38 +(p13) movl r39 = 0xFF9C +} +{ .mfi + nop.m 999 +(p13) fsub.s1 f34 = f0, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) mov f36 = f34 +(p12) cmp.eq p0, p15 = r0, r0 ;; +} +{ .mfi +(p13) setf.exp f35 = r39 +(p13) mov f36 = f1 + nop.i 999 ;; +} +// +// EXPL_MAIN: result = Scale*Y_hi + Scale*Y_lo, computed in status field s0. +// If Safe (p15) is set, return it. Otherwise the same fma is repeated in +// s3 and s2 and the results are tested for underflow and overflow, which +// are reported through __libm_error_region. +// +EXPL_MAIN: +{ .mfi +(p0) cmp.ne.unc p12, p0 = 0x01, r33 +(p0) fmpy.s1 f101 = f36, f35 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p0) fma.s0 f99 = f34, f36, f101 +(p15) br.cond.sptk EXPL_64_RETURN ;; +} +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x01 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl r50 = 0x00000000013FFF ;; +} +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + RZ + TD (Underflows) +// +// +// If (Safe) is true, then +// Compute result using user supplied status field. +// No overflow or underflow here, but perhaps inexact. +// Return +// Else +// Determine if overflow or underflow was raised. +// Fetch +/- overflow threshold for IEEE single, double, +// double extended +// Here only one threshold is created: f60 = setf.exp(0x13FFF) +// +{ .mfi +(p0) setf.exp f60 = r50 +(p0) fma.s3 f102 = f34, f36, f101 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x40 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// For Safe, no need to check for over/under. +// For expm1, handle errors like exp.
+// +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} +{ .mfi + nop.m 999 +(p0) fma.s2 f100 = f34, f36, f101 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x40 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fclass.m.unc p12, p0 = f102, 0x00F + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = f102, 0x00F + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// fclass: resultS3 is denorm/unorm/0 -> set p12 (expm1 only) and p11 +// fcmp: resultS2 >= overflow threshold -> set p10 (expm1 only) and p8 +// +(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60 + nop.i 999 ;; +} +// +// Report that expm1 overflowed (tag 39). The expm1 tests are taken +// first because p8 and p11 are computed for expm1 as well. +// +{ .mib +(p10) mov GR_Parameter_TAG = 39 + nop.i 999 +(p10) br.cond.sptk __libm_error_region ;; +} +// +// Report that exp overflowed (tag 12) +// +{ .mib +(p8) mov GR_Parameter_TAG = 12 + nop.i 999 +(p8) br.cond.sptk __libm_error_region ;; +} +// +// Report that expm1 underflowed (tag 40) +// +{ .mib +(p12) mov GR_Parameter_TAG = 40 + nop.i 999 +(p12) br.cond.sptk __libm_error_region ;; +} +// +// Report that exp underflowed (tag 13) +// +{ .mib +(p11) mov GR_Parameter_TAG = 13 + nop.i 999 +(p11) br.cond.sptk __libm_error_region ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Neither overflow nor underflow was detected: return the s0 result +// +(p0) br.cond.sptk EXPL_64_RETURN ;; +} +// +// EXPL_64_SPECIAL: x is a NatVal, NaN, Inf, or Zero. +// The fclass tests set p6 for NaN, p13 for zero (p14 for zero and expm1), +// p12 for +Inf, p11 for -Inf (p10 for -Inf and expm1). +// NOTE(review): a NatVal input passes the 0x1E7 test at exp_continue but +// seems to match none of the masks below, leaving f99 unwritten - confirm. +// +EXPL_64_SPECIAL: +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6, p0 = f8, 0x0c3 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p13, p8 = f8, 0x007 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fclass.m.unc p14, p0 = f8, 0x007 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p12, p9 = f8, 0x021 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = f8, 0x022 + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fclass.m.unc p10, p0 = f8, 0x022 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// The fclass tests above identify NaN, +/- 0, Inf, or -Inf +// expl(+/-0) = 1 (replaced below for expm1l)
+// +(p13) fadd.s0 f99 = f0, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// expm1l(+/-0) = +/-0 +// No exceptions raised +// +(p14) mov f99 = f8 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p6) fadd.s0 f99 = f8, f1 +// +// NaN input: the fadd above (x plus 1.0 in s0) generates the right kind of NaN +// Return for NaN +// +(p6) br.cond.sptk EXPL_64_RETURN ;; +} +// +// Return for expm1l of zero +// +{ .mib + nop.m 999 + nop.i 999 +(p14) br.cond.sptk EXPL_64_RETURN ;; +} +// +// expl(-Inf) = 0 +// +{ .mfi + nop.m 999 +(p11) mov f99 = f0 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p10) fsub.s1 f99 = f0, f1 +// +// expm1l(-Inf) = -1 (fsub above) +// No exceptions raised. +// +(p10) br.cond.sptk EXPL_64_RETURN ;; +} +{ .mfb + nop.m 999 +(p12) fmpy.s1 f99 = f8, f1 +// +// expl(+Inf) = expm1l(+Inf) = Inf (fmpy above) +// No exceptions raised. +// Return for every remaining special input +// +(p0) br.cond.sptk EXPL_64_RETURN ;; +} +// +// EXPL_64_UNSUPPORTED: x times 0.0 in s0 forms the result for +// EM unsupported encodings +// +EXPL_64_UNSUPPORTED: +{ .mfb + nop.m 999 +(p0) fmpy.s0 f99 = f8, f0 +(p0) br.cond.sptk EXPL_64_RETURN ;; +} +// +// EXPL_64_RETURN: move the result into f8 and return +// +EXPL_64_RETURN: +{ .mfb + nop.m 999 +(p0) mov f8 = f99 +(p0) br.ret.sptk b0 +} +.endp +ASM_SIZE_DIRECTIVE(expl) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return
address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_floor.S b/sysdeps/ia64/fpu/s_floor.S new file mode 100644 index 0000000000..5a63a3c263 --- /dev/null +++ b/sysdeps/ia64/fpu/s_floor.S @@ -0,0 +1,227 @@ +.file "floor.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +.align 32 +.global floor# + +.section .text +.proc floor# +.align 32 + +// History +//============================================================== +// 2/02/00: Initial version +// 3/22/00: Updated to improve performance +// 6/13/00: Improved speed, fixed setting of inexact flag +// 6/27/00: Eliminated incorrect invalid flag setting +// 2/07/01: Corrected sign of zero result in round to -inf mode + +// API +//============================================================== +// double floor(double x) + +// general input registers: + +floor_GR_FFFF = r14 +floor_GR_signexp = r15 +floor_GR_exponent = r16 +floor_GR_expmask = r17 +floor_GR_bigexp = r18 + + +// predicate registers used: + +// p6 ==> Input is NaN, infinity, zero +// p7 ==> Input is denormal +// p8 ==> Input is <0 +// p9 ==> Input is >=0 +// p10 ==> Input is already an integer (bigger than largest integer) +// p11 ==> Input is not a large integer +// p12 ==> Input is a smaller integer +// p13 ==> Input is not an integer, so inexact must be set + + +// floating-point registers used: + +FLOOR_NORM_f8 = f9 +FLOOR_FFFF = f10 +FLOOR_INEXACT = f11 +FLOOR_FLOAT_INT_f8 = f12 +FLOOR_INT_f8 = f13 +FLOOR_adj = f14 + +// Overview of operation +//============================================================== + +// double floor(double x) +// Return an integer value (represented as a double) that is the largest +// value not greater than x +// This is x rounded toward -infinity to an integral value. +// Inexact is set if x != floor(x) +// ************************************************************************** + +// Set denormal flag for denormal input +// and take denormal fault if necessary. + +// Is the input an integer value already? + +// double_extended +// if the exponent is > 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already.
+ +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is > 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. + +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + +#include "libm_support.h" + +floor: +#ifdef _LIBC +.global __floor +__floor: +#endif + +{ .mfi + getf.exp floor_GR_signexp = f8 + fcvt.fx.trunc.s1 FLOOR_INT_f8 = f8 + addl floor_GR_bigexp = 0x10033, r0 +} +{ .mfi + addl floor_GR_FFFF = -1,r0 + fcmp.lt.s1 p8,p9 = f8,f0 + mov floor_GR_expmask = 0x1FFFF ;; +} + +// p7 ==> denorm +{ .mfi + setf.sig FLOOR_FFFF = floor_GR_FFFF + fclass.m p7,p0 = f8, 0x0b + nop.i 999 +} +{ .mfi + nop.m 999 + fnorm.s1 FLOOR_NORM_f8 = f8 + nop.i 999 ;; +} + +// p6 ==> NAN, INF, ZERO +{ .mfb + nop.m 999 + fclass.m p6,p10 = f8, 0xe7 +(p7) br.cond.spnt L(FLOOR_DENORM) ;; +} + +L(FLOOR_COMMON): +.pred.rel "mutex",p8,p9 +// Set adjustment to add to trunc(x) for result +// If x<0, adjustment is -1.0 +// If x>=0, adjustment is 0.0 +{ .mfi + and floor_GR_exponent = floor_GR_signexp, floor_GR_expmask +(p8) fnma.s1 FLOOR_adj = f1,f1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fadd.s1 FLOOR_adj = f0,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fcmp.eq.s0 p12,p0 = f8,f0 // Dummy op to set denormal and invalid flag + nop.i 999 +} +{ .mfi +(p10) cmp.ge.unc p10,p11 = floor_GR_exponent, floor_GR_bigexp +(p6) fnorm.d f8 = f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fcvt.xf FLOOR_FLOAT_INT_f8 = FLOOR_INT_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p10) fnorm.d f8 = FLOOR_NORM_f8 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p11) fadd.d f8 =
FLOOR_FLOAT_INT_f8,FLOOR_adj + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fcmp.eq.unc.s1 p12,p13 = FLOOR_FLOAT_INT_f8, FLOOR_NORM_f8 + nop.i 999 ;; +} + +// Set inexact if result not equal to input (the square of the all-ones significand in FLOOR_FFFF is inexact) +{ .mfi + nop.m 999 +(p13) fmpy.s0 FLOOR_INEXACT = FLOOR_FFFF,FLOOR_FFFF + nop.i 999 +} +// Set result to input if integer +{ .mfb + nop.m 999 +(p12) fnorm.d f8 = FLOOR_NORM_f8 + br.ret.sptk b0 ;; +} + +// Here if input denorm +L(FLOOR_DENORM): +{ .mfb + getf.exp floor_GR_signexp = FLOOR_NORM_f8 + fcvt.fx.trunc.s1 FLOOR_INT_f8 = FLOOR_NORM_f8 + br.cond.sptk L(FLOOR_COMMON) ;; +} + +.endp floor +ASM_SIZE_DIRECTIVE(floor) diff --git a/sysdeps/ia64/fpu/s_floorf.S b/sysdeps/ia64/fpu/s_floorf.S new file mode 100644 index 0000000000..92d58f147d --- /dev/null +++ b/sysdeps/ia64/fpu/s_floorf.S @@ -0,0 +1,224 @@ +.file "floorf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +.align 32 +.global floorf# + +.section .text +.proc floorf# +.align 32 + +// History +//============================================================== +// 2/02/00: Initial version +// 6/13/00: Improved speed +// 6/27/00: Eliminated incorrect invalid flag setting +// 2/07/01: Corrected sign of zero result in round to -inf mode + +// API +//============================================================== +// float floorf(float x) + +// general input registers: + +floor_GR_FFFF = r14 +floor_GR_signexp = r15 +floor_GR_exponent = r16 +floor_GR_expmask = r17 +floor_GR_bigexp = r18 + + +// predicate registers used: + +// p6 ==> Input is NaN, infinity, zero +// p7 ==> Input is denormal +// p8 ==> Input is <0 +// p9 ==> Input is >=0 +// p10 ==> Input is already an integer (bigger than largest integer) +// p11 ==> Input is not a large integer +// p12 ==> Input is a smaller integer +// p13 ==> Input is not an integer, so inexact must be set + + +// floating-point registers used: + +FLOOR_NORM_f8 = f9 +FLOOR_FFFF = f10 +FLOOR_INEXACT = f11 +FLOOR_FLOAT_INT_f8 = f12 +FLOOR_INT_f8 = f13 +FLOOR_adj = f14 + +// Overview of operation +//============================================================== + +// float floorf(float x) +// Return an integer value (represented as a float) that is the largest +// value not greater than x +// This is x rounded toward -infinity to an integral value. +// Inexact is set if x != floorf(x) +// ************************************************************************** + +// Set denormal flag for denormal input +// and take denormal fault if necessary. + +// Is the input an integer value already? + +// double_extended +// if the exponent is > 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits.
+// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is >= 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. + +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + +#include "libm_support.h" + +floorf: +#ifdef _LIBC +.global __floorf +__floorf: +#endif + +{ .mfi + getf.exp floor_GR_signexp = f8 + fcvt.fx.trunc.s1 FLOOR_INT_f8 = f8 + addl floor_GR_bigexp = 0x10016, r0 +} +{ .mfi + addl floor_GR_FFFF = -1,r0 + fcmp.lt.s1 p8,p9 = f8,f0 + mov floor_GR_expmask = 0x1FFFF ;; +} + +// p7 ==> denorm +{ .mfi + setf.sig FLOOR_FFFF = floor_GR_FFFF + fclass.m p7,p0 = f8, 0x0b + nop.i 999 +} +{ .mfi + nop.m 999 + fnorm.s1 FLOOR_NORM_f8 = f8 + nop.i 999 ;; +} + +// p6 ==> NAN, INF, ZERO +{ .mfb + nop.m 999 + fclass.m p6,p10 = f8, 0xe7 +(p7) br.cond.spnt L(FLOOR_DENORM) ;; +} + +L(FLOOR_COMMON): +.pred.rel "mutex",p8,p9 +// Set adjustment to add to trunc(x) for result +// If x<0, adjustment is -1.0 +// If x>=0, adjustment is 0.0 +{ .mfi + and floor_GR_exponent = floor_GR_signexp, floor_GR_expmask +(p8) fnma.s1 FLOOR_adj = f1,f1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fadd.s1 FLOOR_adj = f0,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fcmp.eq.s0 p12,p0 = f8,f0 // Dummy op to set denormal and invalid flag + nop.i 999 +} +{ .mfi +(p10) cmp.ge.unc p10,p11 = floor_GR_exponent, floor_GR_bigexp +(p6) fnorm.s f8 = f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fcvt.xf FLOOR_FLOAT_INT_f8 = FLOOR_INT_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m
999 +(p10) fnorm.s f8 = FLOOR_NORM_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fadd.s f8 = FLOOR_FLOAT_INT_f8,FLOOR_adj + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fcmp.eq.unc.s1 p12,p13 = FLOOR_FLOAT_INT_f8, FLOOR_NORM_f8 + nop.i 999 ;; +} + +// Set inexact if result not equal to input (the square of the all-ones significand in FLOOR_FFFF is inexact) +{ .mfi + nop.m 999 +(p13) fmpy.s0 FLOOR_INEXACT = FLOOR_FFFF,FLOOR_FFFF + nop.i 999 +} +// Set result to input if integer +{ .mfb + nop.m 999 +(p12) fnorm.s f8 = FLOOR_NORM_f8 + br.ret.sptk b0 ;; +} + +// Here if input denorm +L(FLOOR_DENORM): +{ .mfb + getf.exp floor_GR_signexp = FLOOR_NORM_f8 + fcvt.fx.trunc.s1 FLOOR_INT_f8 = FLOOR_NORM_f8 + br.cond.sptk L(FLOOR_COMMON) ;; +} + +.endp floorf +ASM_SIZE_DIRECTIVE(floorf) diff --git a/sysdeps/ia64/fpu/s_floorl.S b/sysdeps/ia64/fpu/s_floorl.S new file mode 100644 index 0000000000..241b2ef5b0 --- /dev/null +++ b/sysdeps/ia64/fpu/s_floorl.S @@ -0,0 +1,224 @@ +.file "floorl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +.align 32 +.global floorl# + +.section .text +.proc floorl# +.align 32 + +// History +//============================================================== +// 2/02/00: Initial version +// 6/13/00: Improved speed +// 6/27/00: Eliminated incorrect invalid flag setting +// 2/07/01: Corrected sign of zero result in round to -inf mode + +// API +//============================================================== +// long double floorl(long double x) + +// general input registers: + +floor_GR_FFFF = r14 +floor_GR_signexp = r15 +floor_GR_exponent = r16 +floor_GR_expmask = r17 +floor_GR_bigexp = r18 + + +// predicate registers used: + +// p6 ==> Input is NaN, infinity, zero +// p7 ==> Input is denormal +// p8 ==> Input is <0 +// p9 ==> Input is >=0 +// p10 ==> Input is already an integer (bigger than largest integer) +// p11 ==> Input is not a large integer +// p12 ==> Input is a smaller integer +// p13 ==> Input is not an integer, so inexact must be set + + +// floating-point registers used: + +FLOOR_NORM_f8 = f9 +FLOOR_FFFF = f10 +FLOOR_INEXACT = f11 +FLOOR_FLOAT_INT_f8 = f12 +FLOOR_INT_f8 = f13 +FLOOR_adj = f14 + +// Overview of operation +//============================================================== + +// long double floorl(long double x) +// Return an integer value (represented as a long double) that is the largest +// value not greater than x +// This is x rounded toward -infinity to an integral value. +// Inexact is set if x != floorl(x) +// ************************************************************************** + +// Set denormal flag for denormal input +// and take denormal fault if necessary. + +// Is the input an integer value already? + +// double_extended +// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits.
+// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is > 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. + +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + +#include "libm_support.h" + +floorl: +#ifdef _LIBC +.global __floorl +__floorl: +#endif + +{ .mfi + getf.exp floor_GR_signexp = f8 + fcvt.fx.trunc.s1 FLOOR_INT_f8 = f8 + addl floor_GR_bigexp = 0x1003e, r0 +} +{ .mfi + addl floor_GR_FFFF = -1,r0 + fcmp.lt.s1 p8,p9 = f8,f0 + mov floor_GR_expmask = 0x1FFFF ;; +} + +// p7 ==> denorm +{ .mfi + setf.sig FLOOR_FFFF = floor_GR_FFFF + fclass.m p7,p0 = f8, 0x0b + nop.i 999 +} +{ .mfi + nop.m 999 + fnorm.s1 FLOOR_NORM_f8 = f8 + nop.i 999 ;; +} + +// p6 ==> NAN, INF, ZERO +{ .mfb + nop.m 999 + fclass.m p6,p10 = f8, 0xe7 +(p7) br.cond.spnt L(FLOOR_DENORM) ;; +} + +L(FLOOR_COMMON): +.pred.rel "mutex",p8,p9 +// Set adjustment to add to trunc(x) for result +// If x<0, adjustment is -1.0 +// If x>=0, adjustment is 0.0 +{ .mfi + and floor_GR_exponent = floor_GR_signexp, floor_GR_expmask +(p8) fnma.s1 FLOOR_adj = f1,f1,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fadd.s1 FLOOR_adj = f0,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 + fcmp.eq.s0 p12,p0 = f8,f0 // Dummy op to set denormal and invalid flag + nop.i 999 +} +{ .mfi +(p10) cmp.ge.unc p10,p11 = floor_GR_exponent, floor_GR_bigexp +(p6) fnorm f8 = f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fcvt.xf FLOOR_FLOAT_INT_f8 = FLOOR_INT_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m
999 +(p10) fnorm f8 = FLOOR_NORM_f8 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p11) fadd f8 = FLOOR_FLOAT_INT_f8,FLOOR_adj + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fcmp.eq.unc.s1 p12,p13 = FLOOR_FLOAT_INT_f8, FLOOR_NORM_f8 + nop.i 999 ;; +} + +// Set inexact if result not equal to input (the square of the all-ones significand in FLOOR_FFFF is inexact) +{ .mfi + nop.m 999 +(p13) fmpy.s0 FLOOR_INEXACT = FLOOR_FFFF,FLOOR_FFFF + nop.i 999 +} +// Set result to input if integer +{ .mfb + nop.m 999 +(p12) fnorm f8 = FLOOR_NORM_f8 + br.ret.sptk b0 ;; +} + +// Here if input denorm +L(FLOOR_DENORM): +{ .mfb + getf.exp floor_GR_signexp = FLOOR_NORM_f8 + fcvt.fx.trunc.s1 FLOOR_INT_f8 = FLOOR_NORM_f8 + br.cond.sptk L(FLOOR_COMMON) ;; +} + +.endp floorl +ASM_SIZE_DIRECTIVE(floorl) diff --git a/sysdeps/ia64/fpu/s_frexp.c b/sysdeps/ia64/fpu/s_frexp.c new file mode 100644 index 0000000000..752a9eec7e --- /dev/null +++ b/sysdeps/ia64/fpu/s_frexp.c @@ -0,0 +1,44 @@ +// +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// + +#include "libm_support.h" + +double frexp(double x, int *y) +{ + +#ifdef SIZE_INT_64 + return( __libm_frexp_8(x, y) ); + +#else + +#ifdef SIZE_INT_32 + return( __libm_frexp_4(x, y) ); +#endif + +#endif + +} diff --git a/sysdeps/ia64/fpu/s_frexpf.c b/sysdeps/ia64/fpu/s_frexpf.c new file mode 100644 index 0000000000..9bbe51d77a --- /dev/null +++ b/sysdeps/ia64/fpu/s_frexpf.c @@ -0,0 +1,44 @@ +// +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// + +#include "libm_support.h" + +float frexpf(float x, int *y) +{ + +#ifdef SIZE_INT_64 + return( __libm_frexp_8f(x, y) ); + +#else + +#ifdef SIZE_INT_32 + return( __libm_frexp_4f(x, y) ); +#endif + +#endif + +} diff --git a/sysdeps/ia64/fpu/s_frexpl.c b/sysdeps/ia64/fpu/s_frexpl.c new file mode 100644 index 0000000000..b85a7791d2 --- /dev/null +++ b/sysdeps/ia64/fpu/s_frexpl.c @@ -0,0 +1,44 @@ +// +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// + +#include "libm_support.h" + +long double frexpl(long double x, int *y) +{ + +#ifdef SIZE_INT_64 + return( __libm_frexp_8l(x, y) ); + +#else + +#ifdef SIZE_INT_32 + return( __libm_frexp_4l(x, y) ); +#endif + +#endif + +} diff --git a/sysdeps/ia64/fpu/s_ilogb.S b/sysdeps/ia64/fpu/s_ilogb.S new file mode 100644 index 0000000000..d860ace598 --- /dev/null +++ b/sysdeps/ia64/fpu/s_ilogb.S @@ -0,0 +1,240 @@ +.file "ilogb.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// History +//============================================================== +// 2/03/00 Initial version +// 5/26/00 Fix bug when x a double-extended denormal; +// if x=0 call error routine, per C9X +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 1/20/01 Fixed result for x=0, corrected error tag value. + +.align 32 +.global ilogb# + +.section .text +.proc ilogb# +.align 32 + +// API +//============================================================== +// int = ilogb(double) + +// Overview of operation +//============================================================== +// ilogb computes log2(x) as an int +// and returns it in r8 + +// ilogb is similar to logb but differs in the following ways: +// +-inf +// ilogb: returns INT_MAX +// logb: returns +inf +// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN) +// ilogb: returns INT_MAX (7fffffff) +// logb: returns QNAN (quieted SNAN) +// 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX) +// ilogb: returns INT_MIN (80000000) +// logb: returns -inf + +// Registers used +//============================================================== + +// general local registers: +// ar.pfs r32 +// r33 -> r37 +// r38 -> r41 used as parameters to error path + +// predicate registers used: +// p6 - x 0 +// p7 - x nan, inf +// p8 - x norm, unorm +// p9 - x unorm + +// floating-point registers used: +// f8 - f10 + +#include "libm_support.h" + +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r34 +GR_SAVE_GP = r35 +GR_Parameter_X = r38 +GR_Parameter_Y = r39 +GR_Parameter_RESULT = r40 +GR_Parameter_TAG = r41 + +FR_X = f8 +FR_Y = f0 +FR_RESULT = f0 + + +ilogb: + +// Form signexp of 2^64 in case need to scale denormal +{ .mmf + alloc r32=ar.pfs,1,5,4,0 +(p0) mov r37 = 0x1003f +(p0) fnorm f9 = f8 ;; +} + +// Form 2^64 in case need to scale denormal +{ .mfi +(p0) setf.exp f10 = r37 +(p0) fclass.m.unc p7, p8 = f8, 0xe3 +(p0) mov r34 = 0xffff ;; +} + +// qnan
snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 +// e 3 +// X ZERO, returns INT_MIN +// X INF or NAN, returns INT_MAX + +{ .mfi +(p0) mov r35 = 0x1ffff +(p8) fclass.m.unc p6, p8 = f8, 0x07 + nop.i 999 ;; +} +{ .mlx + nop.m 999 +(p7) movl r8 = 0x000000007fffffff ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(ILOGB_ZERO) ;; +} + +// Test for denormal +{ .mfi + nop.m 999 +(p8) fclass.m.unc p9, p0 = f9, 0x0b + nop.i 999 ;; +} + +L(ILOGB_COMMON): +// X NORMAL returns true exponent +{ .mmi + nop.m 999 +(p8) getf.exp r33 = f9 + nop.i 999 ;; +} + +// If denormal add 64 to exponent bias for scaling +{ .mfb +(p9) add r34 = 64, r34 + nop.f 999 +(p9) br.cond.spnt L(ILOGB_DENORM) ;; +} + +{ .mmi +(p8) and r36 = r35, r33 + nop.m 999 + nop.i 999 ;; +} + +{ .mib +(p8) sub r8 = r36, r34 + nop.i 999 +(p0) br.ret.sptk b0 ;; +} + +L(ILOGB_DENORM): +// Here if x denormal +// Form x * 2^64 which is normal +// Return to common code +{ .mfb + cmp.eq p8,p9 = r0,r0 + fmpy f9 = f9, f10 + br.cond.sptk L(ILOGB_COMMON) ;; +} + +// X ZERO +// return INT_MIN, call error support (falls through into __libm_error_region) +L(ILOGB_ZERO): +{.mlx + mov GR_Parameter_TAG = 157 +(p6) movl r33 = 0x0000000080000000 ;; +};; +.endp ilogb +ASM_SIZE_DIRECTIVE(ilogb) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk
b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + mov r8 = r33 // Store result +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_ilogbf.S b/sysdeps/ia64/fpu/s_ilogbf.S new file mode 100644 index 0000000000..0fb4d45388 --- /dev/null +++ b/sysdeps/ia64/fpu/s_ilogbf.S @@ -0,0 +1,240 @@ +.file "ilogbf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource.
+// +// History +//============================================================== +// 2/03/00 Initial version +// 5/26/00 Fix bug when x is a double-extended denormal; +// if x=0 call error routine, per C9X +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 1/20/01 Fixed result for x=0 + +.align 32 +.global ilogbf# + +.section .text +.proc ilogbf# +.align 32 + +// API +//============================================================== +// int = ilogbf(float) + +// Overview of operation +//============================================================== +// ilogbf extracts the unbiased binary exponent of x, floor(log2(|x|)), as an int +// and returns it in r8 + +// ilogbf is similar to logbf but differs in the following ways: +// +-inf +// ilogbf: returns INT_MAX +// logbf: returns +inf +// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN) +// ilogbf: returns INT_MAX (7fffffff) +// logbf: returns QNAN (quieted SNAN) +// 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX) +// ilogbf: returns INT_MIN (80000000) +// logbf: returns -inf + +// Registers used +//============================================================== + +// general local registers: +// ar.pfs r32 +// r33 -> r37 +// r38 -> r41 used as parameters to error path + +// predicate registers used: +// p6 - x 0 (takes the error path) +// p7 - x nan, inf +// p8 - x norm, unorm +// p9 - x unorm + +// floating-point registers used: +// f8 - f10 + +#include "libm_support.h" + +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r34 +GR_SAVE_GP = r35 +GR_Parameter_X = r38 +GR_Parameter_Y = r39 +GR_Parameter_RESULT = r40 +GR_Parameter_TAG = r41 + +FR_X = f8 +FR_Y = f0 +FR_RESULT = f0 + + +ilogbf: + +// Form signexp of 2^64 in case need to scale denormal +{ .mmf + alloc r32=ar.pfs,1,5,4,0 +(p0) mov r37 = 0x1003f +(p0) fnorm f9 = f8 ;; +} + +// Form 2^64 in case need to scale denormal +{ .mfi +(p0) setf.exp f10 = r37 +(p0) fclass.m.unc p7, p8 = f8, 0xe3 +(p0) mov r34 = 0xffff ;; +} + +// qnan snan inf norm
unorm 0 -+ +// 1 1 1 0 0 0 11 +// e 3 +// X ZERO, returns INT_MIN +// X INF or NAN, returns INT_MAX + +{ .mfi +(p0) mov r35 = 0x1ffff +(p8) fclass.m.unc p6, p8 = f8, 0x07 + nop.i 999 ;; +} +{ .mlx + nop.m 999 +(p7) movl r8 = 0x000000007fffffff ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(ILOGB_ZERO) ;; +} + +// Test for denormal +{ .mfi + nop.m 999 +(p8) fclass.m.unc p9, p0 = f9, 0x0b + nop.i 999 ;; +} + +L(ILOGB_COMMON): +// X NORMAL returns true exponent +{ .mmi + nop.m 999 +(p8) getf.exp r33 = f9 + nop.i 999 ;; +} + +// If denormal add 64 to exponent bias for scaling +{ .mfb +(p9) add r34 = 64, r34 + nop.f 999 +(p9) br.cond.spnt L(ILOGB_DENORM) ;; +} + +{ .mmi +(p8) and r36 = r35, r33 + nop.m 999 + nop.i 999 ;; +} + +{ .mib +(p8) sub r8 = r36, r34 + nop.i 999 +(p0) br.ret.sptk b0 ;; +} + +L(ILOGB_DENORM): +// Here if x denormal +// Form x * 2^64 which is normal +// Return to common code +{ .mfb + cmp.eq p8,p9 = r0,r0 + fmpy f9 = f9, f10 + br.cond.sptk L(ILOGB_COMMON) ;; +} + +// X ZERO +// return INT_MIN, call error support (falls through into __libm_error_region) +L(ILOGB_ZERO): +{.mlx + mov GR_Parameter_TAG = 158 +(p6) movl r33 = 0x0000000080000000 ;; +};; +.endp ilogbf +ASM_SIZE_DIRECTIVE(ilogbf) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call
error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + mov r8 = r33 // Return INT_MIN (placed in r33 at ILOGB_ZERO) +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_ilogbl.S b/sysdeps/ia64/fpu/s_ilogbl.S new file mode 100644 index 0000000000..4c67d49fe3 --- /dev/null +++ b/sysdeps/ia64/fpu/s_ilogbl.S @@ -0,0 +1,240 @@ +.file "ilogbl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource.
+// +// History +//============================================================== +// 2/03/00 Initial version +// 5/26/00 Fix bug when x is a double-extended denormal; +// if x=0 call error routine, per C9X +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 1/20/01 Fixed result for x=0 + +.align 32 +.global ilogbl# + +.section .text +.proc ilogbl# +.align 32 + +// API +//============================================================== +// int = ilogbl(double_extended) + +// Overview of operation +//============================================================== +// ilogbl extracts the unbiased binary exponent of x, floor(log2(|x|)), as an int +// and returns it in r8 + +// ilogbl is similar to logbl but differs in the following ways: +// +-inf +// ilogbl: returns INT_MAX +// logbl: returns +inf +// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN) +// ilogbl: returns INT_MAX (7fffffff) +// logbl: returns QNAN (quieted SNAN) +// 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX) +// ilogbl: returns INT_MIN (80000000) +// logbl: returns -inf + +// Registers used +//============================================================== + +// general local registers: +// ar.pfs r32 +// r33 -> r37 +// r38 -> r41 used as parameters to error path + +// predicate registers used: +// p6 - x 0 (takes the error path) +// p7 - x nan, inf +// p8 - x norm, unorm +// p9 - x unorm + +// floating-point registers used: +// f8 - f10 + +#include "libm_support.h" + +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r34 +GR_SAVE_GP = r35 +GR_Parameter_X = r38 +GR_Parameter_Y = r39 +GR_Parameter_RESULT = r40 +GR_Parameter_TAG = r41 + +FR_X = f8 +FR_Y = f0 +FR_RESULT = f0 + + +ilogbl: + +// Form signexp of 2^64 in case need to scale denormal +{ .mmf + alloc r32=ar.pfs,1,5,4,0 +(p0) mov r37 = 0x1003f +(p0) fnorm f9 = f8 ;; +} + +// Form 2^64 in case need to scale denormal +{ .mfi +(p0) setf.exp f10 = r37 +(p0) fclass.m.unc p7, p8 = f8, 0xe3 +(p0) mov r34 = 0xffff ;; +} + +// qnan snan inf
norm unorm 0 -+ +// 1 1 1 0 0 0 11 +// e 3 +// X ZERO, returns INT_MIN +// X INF or NAN, returns INT_MAX + +{ .mfi +(p0) mov r35 = 0x1ffff +(p8) fclass.m.unc p6, p8 = f8, 0x07 + nop.i 999 ;; +} +{ .mlx + nop.m 999 +(p7) movl r8 = 0x000000007fffffff ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(ILOGB_ZERO) ;; +} + +// Test for denormal +{ .mfi + nop.m 999 +(p8) fclass.m.unc p9, p0 = f9, 0x0b + nop.i 999 ;; +} + +L(ILOGB_COMMON): +// X NORMAL returns true exponent +{ .mmi + nop.m 999 +(p8) getf.exp r33 = f9 + nop.i 999 ;; +} + +// If denormal add 64 to exponent bias for scaling +{ .mfb +(p9) add r34 = 64, r34 + nop.f 999 +(p9) br.cond.spnt L(ILOGB_DENORM) ;; +} + +{ .mmi +(p8) and r36 = r35, r33 + nop.m 999 + nop.i 999 ;; +} + +{ .mib +(p8) sub r8 = r36, r34 + nop.i 999 +(p0) br.ret.sptk b0 ;; +} + +L(ILOGB_DENORM): +// Here if x denormal +// Form x * 2^64 which is normal +// Return to common code +{ .mfb + cmp.eq p8,p9 = r0,r0 + fmpy f9 = f9, f10 + br.cond.sptk L(ILOGB_COMMON) ;; +} + +// X ZERO +// return INT_MIN, call error support (falls through into __libm_error_region) +L(ILOGB_ZERO): +{.mlx + mov GR_Parameter_TAG = 156 +(p6) movl r33 = 0x0000000080000000 ;; +};; +.endp ilogbl +ASM_SIZE_DIRECTIVE(ilogbl) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# //
Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + mov r8 = r33 // Return INT_MIN (placed in r33 at ILOGB_ZERO) +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_ldexp.S b/sysdeps/ia64/fpu/s_ldexp.S new file mode 100644 index 0000000000..73bd2f4ed3 --- /dev/null +++ b/sysdeps/ia64/fpu/s_ldexp.S @@ -0,0 +1,367 @@ +.file "ldexp.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource.
+// +// History +//============================================================== +// 2/02/00 Initial version +// 1/26/01 ldexp completely reworked and now standalone version +// +// API +//============================================================== +// double = ldexp (double x, int n) +// input floating point f8 and int n (r33) +// output floating point f8 +// +// Returns x * 2**n using an fma and detects overflow +// and underflow. +// +// + +#include "libm_support.h" + +FR_Big = f6 +FR_NBig = f7 +FR_Floating_X = f8 +FR_Result = f8 +FR_Result2 = f9 +FR_Result3 = f11 +FR_Norm_X = f12 +FR_Two_N = f14 +FR_Two_to_Big = f15 + +GR_N_Biased = r15 +GR_Big = r16 +GR_NBig = r17 +GR_Scratch = r18 +GR_Scratch1 = r19 +GR_Bias = r20 +GR_N_as_int = r21 + +GR_SAVE_B0 = r32 +GR_SAVE_GP = r33 +GR_SAVE_PFS = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Tag = r38 + +.align 32 +.global ldexp + +.section .text +.proc ldexp +.align 32 + +ldexp: + +// +// Is x NAN, INF, ZERO, +-? +// Build the exponent Bias +// +{ .mfi + alloc r32=ar.pfs,1,2,4,0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Bias = 0x0FFFF,r0 +} + +// +// Sign extend input +// Is N zero? +// Normalize x +// +{ .mfi + cmp.eq.unc p6,p0 = r33,r0 + fnorm.s1 FR_Norm_X = FR_Floating_X + sxt4 GR_N_as_int = r33 +} +;; + +// +// Normalize x +// Branch and return special values.
+// Create -35000 +// Create 35000 +// +{ .mfi + addl GR_Big = 35000,r0 + nop.f 0 + add GR_N_Biased = GR_Bias,GR_N_as_int +} +{ .mfb + addl GR_NBig = -35000,r0 +(p7) fma.d.s0 FR_Result = FR_Floating_X,f1, f0 +(p7) br.ret.spnt b0 +};; + +// +// Build the exponent Bias +// Return x when N = 0 +// +{ .mfi + setf.exp FR_Two_N = GR_N_Biased + nop.f 0 + addl GR_Scratch1 = 0x063BF,r0 +} +{ .mfb + addl GR_Scratch = 0x019C3F,r0 +(p6) fma.d.s0 FR_Result = FR_Floating_X,f1, f0 +(p6) br.ret.spnt b0 +};; + +// +// Create 2**big +// Create 2**-big +// Is N >= 35000 +// Is N <= -35000 +// Raise Denormal operand flag with compare +// Main path, create 2**N +// +{ .mfi + setf.exp FR_NBig = GR_Scratch1 + nop.f 0 + cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big +} +{ .mfi + setf.exp FR_Big = GR_Scratch + fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 + cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig +};; + +// +// Adjust 2**N if N was very small or very large +// +{ .mfi + nop.m 0 +(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch = 0x00000000000303FF +};; + + +{ .mfi + nop.m 0 +(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch1= 0x00000000000103FF +};; + +// Set up necessary status fields +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x41 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999 +};; + +// +// Do final operation +// +{ .mfi + setf.exp FR_NBig = GR_Scratch + fma.d.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.d.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; +{ .mfi + setf.exp FR_Big = GR_Scratch1 + fma.d.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; + +// +// Check for overflow or underflow.
+// Restore s3 +// Restore s2 +// +{ .mfi + nop.m 0 + fsetc.s3 0x7F,0x40 + nop.i 999 +} +{ .mfi + nop.m 0 + fsetc.s2 0x7F,0x40 + nop.i 999 +};; + +// +// Is the result zero? +// +{ .mfi + nop.m 999 + fclass.m.unc p6, p0 = FR_Result3, 0x007 + nop.i 999 +} +{ .mfi + addl GR_Tag = 146, r0 + fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big + nop.i 0 +};; + +// +// Detect masked underflow - Tiny + Inexact Only +// +{ .mfi + nop.m 999 +(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 + nop.i 999 +};; + +// +// Is result bigger than the allowed range? +// Branch out for underflow +// +{ .mfb +(p6) addl GR_Tag = 147, r0 +(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig +(p6) br.cond.spnt L(LDEXP_UNDERFLOW) +};; + +// +// Branch out for overflow +// +{ .mbb + nop.m 0 +(p7) br.cond.spnt L(LDEXP_OVERFLOW) +(p9) br.cond.spnt L(LDEXP_OVERFLOW) +};; + +// +// Return from main path. +// +{ .mfb + nop.m 999 + nop.f 0 + br.ret.sptk b0;; +} + +.endp ldexp +ASM_SIZE_DIRECTIVE(ldexp) +.proc __libm_error_region +__libm_error_region: + +L(LDEXP_OVERFLOW): +L(LDEXP_UNDERFLOW): + +// +// Get stack address of N +// +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +// +// Adjust sp +// +{ .mfi +.fframe 64 + add sp=-64,sp + nop.f 0 + mov GR_SAVE_GP=gp +};; + +// +// Store N on stack in correct position +// Locate the address of x on stack +// +{ .mmi + st8 [GR_Parameter_Y] = GR_N_as_int,16 + add GR_Parameter_X = 16,sp +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +};; + +// +// Store x on the stack. +// Get address for result on stack.
+// +.body +{ .mib + stfd [GR_Parameter_X] = FR_Norm_X + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = FR_Result + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# +};; + +// +// Recompute location of result on stack after the call +// +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// +// Get the new result +// +{ .mmi + ldfd FR_Result = [GR_Parameter_RESULT] +.restore sp + add sp = 64,sp + mov b0 = GR_SAVE_B0 +};; + +// +// Restore gp, ar.pfs and return +// +{ .mib + mov gp = GR_SAVE_GP + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_ldexpf.S b/sysdeps/ia64/fpu/s_ldexpf.S new file mode 100644 index 0000000000..07f750d80d --- /dev/null +++ b/sysdeps/ia64/fpu/s_ldexpf.S @@ -0,0 +1,366 @@ +//.file "ldexpf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 1/26/01 ldexpf completely reworked and now standalone version +// +// API +//============================================================== +// float = ldexpf (float x, int n) +// input floating point f8 and int n (r33) +// output floating point f8 +// +// Returns x * 2**n using an fma and detects overflow +// and underflow. +// +// + +#include "libm_support.h" + +FR_Big = f6 +FR_NBig = f7 +FR_Floating_X = f8 +FR_Result = f8 +FR_Result2 = f9 +FR_Result3 = f11 +FR_Norm_X = f12 +FR_Two_N = f14 +FR_Two_to_Big = f15 + +GR_N_Biased = r15 +GR_Big = r16 +GR_NBig = r17 +GR_Scratch = r18 +GR_Scratch1 = r19 +GR_Bias = r20 +GR_N_as_int = r21 + +GR_SAVE_B0 = r32 +GR_SAVE_GP = r33 +GR_SAVE_PFS = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Tag = r38 + +.align 32 +.global ldexpf + +.section .text +.proc ldexpf +.align 32 + +ldexpf: + +// +// Is x NAN, INF, ZERO, +-? +// Build the exponent Bias +// +{ .mfi + alloc r32=ar.pfs,1,2,4,0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Bias = 0x0FFFF,r0 +} + +// +// Sign extend input +// Is N zero?
+// Normalize x +// +{ .mfi + cmp.eq.unc p6,p0 = r33,r0 + fnorm.s1 FR_Norm_X = FR_Floating_X + sxt4 GR_N_as_int = r33 +} +;; + +// +// Normalize x +// Branch and return special values. +// Create -35000 +// Create 35000 +// +{ .mfi + addl GR_Big = 35000,r0 + nop.f 0 + add GR_N_Biased = GR_Bias,GR_N_as_int +} +{ .mfb + addl GR_NBig = -35000,r0 +(p7) fma.s.s0 FR_Result = FR_Floating_X,f1, f0 +(p7) br.ret.spnt b0 +};; + +// +// Build the exponent Bias +// Return x when N = 0 +// +{ .mfi + setf.exp FR_Two_N = GR_N_Biased + nop.f 0 + addl GR_Scratch1 = 0x063BF,r0 +} +{ .mfb + addl GR_Scratch = 0x019C3F,r0 +(p6) fma.s.s0 FR_Result = FR_Floating_X,f1, f0 +(p6) br.ret.spnt b0 +};; + +// +// Create 2**big +// Create 2**-big +// Is N >= 35000 +// Is N <= -35000 +// Raise Denormal operand flag with compare +// Main path, create 2**N +// +{ .mfi + setf.exp FR_NBig = GR_Scratch1 + nop.f 0 + cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big +} +{ .mfi + setf.exp FR_Big = GR_Scratch + fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 + cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig +};; + +// +// Adjust 2**N if N was very small or very large +// +{ .mfi + nop.m 0 +(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch = 0x000000000003007F +};; + + +{ .mfi + nop.m 0 +(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch1= 0x000000000001007F +};; + +// Set up necessary status fields +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x41 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999 +};; + +// +// Do final operation +// +{ .mfi + setf.exp FR_NBig = GR_Scratch + fma.s.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; +{ .mfi + setf.exp FR_Big = GR_Scratch1 + fma.s.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 +
nop.i 999 +};; + +// Check for overflow or underflow. +// Restore s3 +// Restore s2 +// +{ .mfi + nop.m 0 + fsetc.s3 0x7F,0x40 + nop.i 999 +} +{ .mfi + nop.m 0 + fsetc.s2 0x7F,0x40 + nop.i 999 +};; + +// +// Is the result zero? +// +{ .mfi + nop.m 999 + fclass.m.unc p6, p0 = FR_Result3, 0x007 + nop.i 999 +} +{ .mfi + addl GR_Tag = 148, r0 + fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big + nop.i 0 +};; + +// +// Detect masked underflow - Tiny + Inexact Only +// +{ .mfi + nop.m 999 +(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 + nop.i 999 +};; + +// +// Is result bigger than the allowed range? +// Branch out for underflow +// +{ .mfb +(p6) addl GR_Tag = 149, r0 +(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig +(p6) br.cond.spnt L(ldexpf_UNDERFLOW) +};; + +// +// Branch out for overflow +// +{ .mbb + nop.m 0 +(p7) br.cond.spnt L(ldexpf_OVERFLOW) +(p9) br.cond.spnt L(ldexpf_OVERFLOW) +};; + +// +// Return from main path. +// +{ .mfb + nop.m 999 + nop.f 0 + br.ret.sptk b0;; +} + +.endp ldexpf +ASM_SIZE_DIRECTIVE(ldexpf) +.proc __libm_error_region +__libm_error_region: + +L(ldexpf_OVERFLOW): +L(ldexpf_UNDERFLOW): + +// +// Get stack address of N +// +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +// +// Adjust sp +// +{ .mfi +.fframe 64 + add sp=-64,sp + nop.f 0 + mov GR_SAVE_GP=gp +};; + +// +// Store N on stack in correct position +// Locate the address of x on stack +// +{ .mmi + st8 [GR_Parameter_Y] = GR_N_as_int,16 + add GR_Parameter_X = 16,sp +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +};; + +// +// Store x on the stack. +// Get address for result on stack.
+// +.body +{ .mib + stfs [GR_Parameter_X] = FR_Norm_X + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 +} +{ .mib + stfs [GR_Parameter_Y] = FR_Result + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# +};; + +// +// Recompute location of result on stack after the call +// +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// +// Get the new result +// +{ .mmi + ldfs FR_Result = [GR_Parameter_RESULT] +.restore sp + add sp = 64,sp + mov b0 = GR_SAVE_B0 +};; + +// +// Restore gp, ar.pfs and return +// +{ .mib + mov gp = GR_SAVE_GP + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_ldexpl.S b/sysdeps/ia64/fpu/s_ldexpl.S new file mode 100644 index 0000000000..d9983a501f --- /dev/null +++ b/sysdeps/ia64/fpu/s_ldexpl.S @@ -0,0 +1,366 @@ +//.file "ldexpl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 1/26/01 ldexpl completely reworked and now standalone version +// +// API +//============================================================== +// double-extended = ldexpl (double-extended x, int n) +// input floating point f8 and int n (r34) +// output floating point f8 +// +// Returns x * 2**n using an fma and detects overflow +// and underflow. +// +// + +#include "libm_support.h" + +FR_Big = f6 +FR_NBig = f7 +FR_Floating_X = f8 +FR_Result = f8 +FR_Result2 = f9 +FR_Result3 = f11 +FR_Norm_X = f12 +FR_Two_N = f14 +FR_Two_to_Big = f15 + +GR_N_Biased = r15 +GR_Big = r16 +GR_NBig = r17 +GR_Scratch = r18 +GR_Scratch1 = r19 +GR_Bias = r20 +GR_N_as_int = r21 + +GR_SAVE_B0 = r32 +GR_SAVE_GP = r33 +GR_SAVE_PFS = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Tag = r38 + +.align 32 +.global ldexpl + +.section .text +.proc ldexpl +.align 32 + +ldexpl: + +// +// Is x NAN, INF, ZERO, +-? +// Build the exponent Bias +// +{ .mfi + alloc r32=ar.pfs,2,1,4,0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Bias = 0x0FFFF,r0 +} + +// +// Sign extend input +// Is N zero?
+// Normalize x +// +{ .mfi + cmp.eq.unc p6,p0 = r34,r0 + fnorm.s1 FR_Norm_X = FR_Floating_X + sxt4 GR_N_as_int = r34 +} +;; + +// +// Normalize x +// Branch and return special values. +// Create -35000 +// Create 35000 +// +{ .mfi + addl GR_Big = 35000,r0 + nop.f 0 + add GR_N_Biased = GR_Bias,GR_N_as_int +} +{ .mfb + addl GR_NBig = -35000,r0 +(p7) fma.s0 FR_Result = FR_Floating_X,f1, f0 +(p7) br.ret.spnt b0 +};; + +// +// Build the exponent Bias +// Return x when N = 0 +// +{ .mfi + setf.exp FR_Two_N = GR_N_Biased + nop.f 0 + addl GR_Scratch1 = 0x063BF,r0 +} +{ .mfb + addl GR_Scratch = 0x019C3F,r0 +(p6) fma.s0 FR_Result = FR_Floating_X,f1, f0 +(p6) br.ret.spnt b0 +};; + +// +// Create 2**big +// Create 2**-big +// Is N >= 35000 +// Is N <= -35000 +// Raise Denormal operand flag with compare +// Main path, create 2**N +// +{ .mfi + setf.exp FR_NBig = GR_Scratch1 + nop.f 0 + cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big +} +{ .mfi + setf.exp FR_Big = GR_Scratch + fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 + cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig +};; + +// +// Adjust 2**N if N was very small or very large +// +{ .mfi + nop.m 0 +(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch = 0x0000000000033FFF +};; + + +{ .mfi + nop.m 0 +(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch1= 0x0000000000013FFF +};; + +// Set up necessary status fields +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x41 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999 +};; + +// +// Do final operation +// +{ .mfi + setf.exp FR_NBig = GR_Scratch + fma.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; +{ .mfi + setf.exp FR_Big = GR_Scratch1 + fma.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};;
+ +// Check for overflow or underflow. +// Restore s3 +// Restore s2 +// +{ .mfi + nop.m 0 + fsetc.s3 0x7F,0x40 + nop.i 999 +} +{ .mfi + nop.m 0 + fsetc.s2 0x7F,0x40 + nop.i 999 +};; + +// +// Is the result zero? +// +{ .mfi + nop.m 999 + fclass.m.unc p6, p0 = FR_Result3, 0x007 + nop.i 999 +} +{ .mfi + addl GR_Tag = 144, r0 + fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big + nop.i 0 +};; + +// +// Detect masked underflow - Tiny + Inexact Only +// +{ .mfi + nop.m 999 +(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 + nop.i 999 +};; + +// +// Is result bigger than the allowed range? +// Branch out for underflow +// +{ .mfb +(p6) addl GR_Tag = 145, r0 +(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig +(p6) br.cond.spnt L(ldexpl_UNDERFLOW) +};; + +// +// Branch out for overflow +// +{ .mbb + nop.m 0 +(p7) br.cond.spnt L(ldexpl_OVERFLOW) +(p9) br.cond.spnt L(ldexpl_OVERFLOW) +};; + +// +// Return from main path. +// +{ .mfb + nop.m 999 + nop.f 0 + br.ret.sptk b0;; +} + +.endp ldexpl +ASM_SIZE_DIRECTIVE(ldexpl) +.proc __libm_error_region +__libm_error_region: + +L(ldexpl_OVERFLOW): +L(ldexpl_UNDERFLOW): + +// +// Get stack address of N +// +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +// +// Adjust sp +// +{ .mfi +.fframe 64 + add sp=-64,sp + nop.f 0 + mov GR_SAVE_GP=gp +};; + +// +// Store N on stack in correct position +// Locate the address of x on stack +// +{ .mmi + st8 [GR_Parameter_Y] = GR_N_as_int,16 + add GR_Parameter_X = 16,sp +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +};; + +// +// Store x on the stack. +// Get address for result on stack.
+// +.body +{ .mib + stfe [GR_Parameter_X] = FR_Norm_X + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 +} +{ .mib + stfe [GR_Parameter_Y] = FR_Result + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# +};; + +// +// Recompute location of result on stack after the call +// +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// +// Get the new result +// +{ .mmi + ldfe FR_Result = [GR_Parameter_RESULT] +.restore sp + add sp = 64,sp + mov b0 = GR_SAVE_B0 +};; + +// +// Restore gp, ar.pfs and return +// +{ .mib + mov gp = GR_SAVE_GP + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_log1p.S b/sysdeps/ia64/fpu/s_log1p.S new file mode 100644 index 0000000000..a49a183ce3 --- /dev/null +++ b/sysdeps/ia64/fpu/s_log1p.S @@ -0,0 +1,1614 @@ +.file "log1p.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// +// ********************************************************************* +// +// Function: log1p(x) = ln(x+1), for double precision x values +// +// ********************************************************************* +// +// Accuracy: Very accurate for double precision values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9,f33-f55,f99 +// +// General Purpose Registers: +// r32-r53 +// r54-r57 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions cannot occur +// Underflow exceptions raised when appropriate for log1p +// (Error Handling Routine called for underflow) +// Inexact raised when appropriate by algorithm +// +// log1p(inf) = inf +// 
log1p(-inf) = QNaN +// log1p(+/-0) = +/-0 +// log1p(-1) = -inf +// log1p(SNaN) = QNaN +// log1p(QNaN) = QNaN +// log1p(EM_special Values) = QNaN +// +// ********************************************************************* +// +// Computation is based on the following kernel. +// +// ker_log_64( in_FR : X, +// in_FR : E, +// in_FR : Em1, +// in_GR : Expo_Range, +// out_FR : Y_hi, +// out_FR : Y_lo, +// out_FR : Scale, +// out_PR : Safe ) +// +// Overview +// +// The method consists of three cases. +// +// If |X+Em1| < 2^(-80) use case log1p_small; +// elseif |X+Em1| < 2^(-7) use case log_near1; +// else use case log_regular; +// +// Case log1p_small: +// +// log( 1 + (X+Em1) ) can be approximated by (X+Em1). +// +// Case log_near1: +// +// log( 1 + (X+Em1) ) can be approximated by a simple polynomial +// in W = X+Em1. This polynomial resembles the truncated Taylor +// series W - W^2/2 + W^3/3 - ... +// +// Case log_regular: +// +// Here we use a table lookup method. The basic idea is that in +// order to compute log(Arg) for an argument Arg in [1,2), we +// construct a value G such that G*Arg is close to 1 and that +// log(1/G) is obtainable easily from a table of values calculated +// beforehand. Thus +// +// log(Arg) = log(1/G) + log(G*Arg) +// = log(1/G) + log(1 + (G*Arg - 1)) +// +// Because |G*Arg - 1| is small, the second term on the right hand +// side can be approximated by a short polynomial. We elaborate +// this method in four steps. +// +// Step 0: Initialization +// +// We need to calculate log( E + X ). Obtain N, S_hi, S_lo such that +// +// E + X = 2^N * ( S_hi + S_lo ) exactly +// +// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense +// that |S_lo| <= ulp(S_hi). 
+// +// Step 1: Argument Reduction +// +// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate +// +// G := G_1 * G_2 * G_3 +// r := (G * S_hi - 1) + G * S_lo +// +// These G_j's have the property that the product is exactly +// representable and that |r| < 2^(-12) as a result. +// +// Step 2: Approximation +// +// +// log(1 + r) is approximated by a short polynomial poly(r). +// +// Step 3: Reconstruction +// +// +// Finally, log( E + X ) is given by +// +// log( E + X ) = log( 2^N * (S_hi + S_lo) ) +// ~=~ N*log(2) + log(1/G) + log(1 + r) +// ~=~ N*log(2) + log(1/G) + poly(r). +// +// **** Algorithm **** +// +// Case log1p_small: +// +// Although log(1 + (X+Em1)) is basically X+Em1, we would like to +// preserve the inexact nature as well as consistent behavior +// under different rounding modes. Note that this case can only be +// taken if E is set to be 1.0. In this case, Em1 is zero, and +// X can be very tiny and thus the final result can possibly underflow. +// Thus, we compare X against a threshold that is dependent on the +// input Expo_Range. If |X| is smaller than this threshold, we set +// SAFE to be FALSE. +// +// The result is returned as Y_hi, Y_lo, and in the case that SAFE +// is FALSE, an additional value Scale is also returned. +// +// W := X + Em1 +// Threshold := Threshold_Table( Expo_Range ) +// Tiny := Tiny_Table( Expo_Range ) +// +// If ( |W| > Threshold ) then +// Y_hi := W +// Y_lo := -W*W +// Else +// Y_hi := W +// Y_lo := -Tiny +// Scale := 2^(-100) +// Safe := FALSE +// EndIf +// +// +// One may think that Y_lo should be -W*W/2; however, it does not matter +// as Y_lo will be rounded off completely except for the correct effect in +// directed rounding. Clearly -W*W is simpler to compute. Moreover, +// because of the difference in exponent value, Y_hi + Y_lo or +// Y_hi + Scale*Y_lo is always inexact. +// +// Case log_near1: +// +// Here we compute a simple polynomial. 
To exploit parallelism, we split +// the polynomial into two portions. +// +// W := X + Em1 +// Wsq := W * W +// W4 := Wsq*Wsq +// W6 := W4*Wsq +// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4))) +// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8))) +// set lsb(Y_lo) to be 1 +// +// Case log_regular: +// +// We present the algorithm in four steps. +// +// Step 0. Initialization +// ---------------------- +// +// Z := X + E +// N := unbiased exponent of Z +// S_hi := 2^(-N) * Z +// S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) } +// +// Note that S_lo is always 0 for the case E = 0. +// +// Step 1. Argument Reduction +// -------------------------- +// +// Let +// +// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63 +// +// We obtain G_1, G_2, G_3 by the following steps. +// +// +// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted +// from S_hi. +// +// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated +// to lsb = 2^(-4). +// +// Define index_1 := [ d_1 d_2 d_3 d_4 ]. +// +// Fetch Z_1 := (1/A_1) rounded UP in fixed point with +// fixed point lsb = 2^(-15). +// Z_1 looks like z_0.z_1 z_2 ... z_15 +// Note that the fetching is done using index_1. +// A_1 is actually not needed in the implementation +// and is used here only to explain how the value +// Z_1 is defined. +// +// Fetch G_1 := (1/A_1) truncated to 21 sig. bits. +// floating pt. Again, fetching is done using index_1. A_1 +// explains how G_1 is defined. +// +// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) +// = 1.0 0 0 0 d_5 ... d_14 +// This is accomplished by integer multiplication. +// It is proved that X_1 indeed always begins +// with 1.0000 in fixed point. +// +// +// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 +// truncated to lsb = 2^(-8). Similar to A_1, +// A_2 is not needed in actual implementation. It +// helps explain how some of the values are defined. +// +// Define index_2 := [ d_5 d_6 d_7 d_8 ]. 
+// +// Fetch Z_2 := (1/A_2) rounded UP in fixed point with +// fixed point lsb = 2^(-15). Fetch done using index_2. +// Z_2 looks like z_0.z_1 z_2 ... z_15 +// +// Fetch G_2 := (1/A_2) truncated to 21 sig. bits. +// floating pt. +// +// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) +// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 +// This is accomplished by integer multiplication. +// It is proved that X_2 indeed always begins +// with 1.00000000 in fixed point. +// +// +// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. +// This is 2^(-14) + X_2 truncated to lsb = 2^(-13). +// +// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. +// +// Fetch G_3 := (1/A_3) truncated to 21 sig. bits. +// floating pt. Fetch is done using index_3. +// +// Compute G := G_1 * G_2 * G_3. +// +// This is done exactly since each G_j has only 21 sig. bits. +// +// Compute +// +// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations. +// +// Thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of +// rounding errors. +// +// +// Step 2. Approximation +// --------------------- +// +// This step computes an approximation to log( 1 + r ) where r is the +// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13); +// thus log(1+r) can be approximated by a short polynomial: +// +// log(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 +// +// +// Step 3. Reconstruction +// ---------------------- +// +// This step computes the desired result of log(X+E): +// +// log(X+E) = log( 2^N * (S_hi + S_lo) ) +// = N*log(2) + log( S_hi + S_lo ) +// = N*log(2) + log(1/G) + +// log(1 + G*(S_hi+S_lo) - 1 ) +// +// log(2), log(1/G_j) are stored as pairs of (single,double) numbers: +// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are +// single-precision numbers and the low parts are double precision +// numbers. These have the property that +// +// N*log2_hi + SUM ( log1byGj_hi ) +// +// is computable exactly in double-extended precision (64 sig. bits). 
+// Finally +// +// Y_hi := N*log2_hi + SUM ( log1byGj_hi ) +// Y_lo := poly_hi + [ poly_lo + +// ( SUM ( log1byGj_lo ) + N*log2_lo ) ] +// set lsb(Y_lo) to be 1 +// + +#include "libm_support.h" + +#ifdef _LIBC +.rodata +#else +.data +#endif + +// P_7, P_6, P_5, P_4, P_3, P_2, and P_1 + +.align 64 +Constants_P: +ASM_TYPE_DIRECTIVE(Constants_P,@object) +data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000 +data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000 +data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000 +data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000 +data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000 +data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000 +data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000 +data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_P) + +// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 + +.align 64 +Constants_Q: +ASM_TYPE_DIRECTIVE(Constants_Q,@object) +data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 +data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 +data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 +data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 +data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 +data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_Q) + +// Z1 - 16 bit fixed, G1 and H1 - IEEE single + +.align 64 +Constants_Z_G_H_h1: +ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object) +data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6 +data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6 +data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF +data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C +data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C +data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F +data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B +data4 
0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34 +data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E +data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C +data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3 +data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2 +data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895 +data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5 +data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874 +ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1) + +// Z2 - 16 bit fixed, G2 and H2 - IEEE single + +.align 64 +Constants_Z_G_H_h2: +ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object) +data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116 +data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF +data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E +data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0 +data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F +data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791 +data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C +data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156 +data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97 +data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483 +data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9 +data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06 +data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202 +data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4 +data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391 +ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2) + +// G3 and H3 - IEEE single and h3 -IEEE 
double + +.align 64 +Constants_Z_G_H_h3: +ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object) +data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595 +data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2 +data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D +data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291 +data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8 +data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707 +data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9 +data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47 +data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E +data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D +data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441 +data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95 +data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC +data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337 +data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B +data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B +data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21 +data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4 +data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070 +data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC +data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83 +data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40 +data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7 +data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B +data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E +data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06 +data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1 +data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103 +data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B +data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19 +data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502 +data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17 +ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3) + +// +// Exponent Thresholds and Tiny Thresholds +// for 8, 11, 15, and 17 bit exponents +// +// Expo_Range Value +// +// 0 (8 bits) 2^(-126) +// 1 (11 bits) 2^(-1022) +// 2 (15 bits) 2^(-16382) +// 3 (17 bits) 2^(-16382) +// +// 
Tiny_Table +// ---------- +// Expo_Range Value +// +// 0 (8 bits) 2^(-16382) +// 1 (11 bits) 2^(-16382) +// 2 (15 bits) 2^(-16382) +// 3 (17 bits) 2^(-16382) +// + +.align 64 +Constants_Threshold: +ASM_TYPE_DIRECTIVE(Constants_Threshold,@object) +data4 0x00000000,0x80000000,0x00003F81,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00003C01,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_Threshold) + +.align 64 +Constants_1_by_LN10: +ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object) +data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000 +data4 0xACCF70C8,0xD56EAABE,0x00003FBD,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_1_by_LN10) + +FR_Input_X = f8 +FR_Neg_One = f9 +FR_E = f33 +FR_Em1 = f34 +FR_Y_hi = f34 +// Shared with Em1 +FR_Y_lo = f35 +FR_Scale = f36 +FR_X_Prime = f37 +FR_Z = f38 +FR_S_hi = f38 +// Shared with Z +FR_W = f39 +FR_G = f40 +FR_wsq = f40 +// Shared with G +FR_H = f41 +FR_w4 = f41 +// Shared with H +FR_h = f42 +FR_w6 = f42 +// Shared with h +FR_G_tmp = f43 +FR_poly_lo = f43 +// Shared with G_tmp +FR_P8 = f43 +// Shared with G_tmp +FR_H_tmp = f44 +FR_poly_hi = f44 + // Shared with H_tmp +FR_P7 = f44 +// Shared with H_tmp +FR_h_tmp = f45 +FR_rsq = f45 +// Shared with h_tmp +FR_P6 = f45 +// Shared with h_tmp +FR_abs_W = f46 +FR_r = f46 +// Shared with abs_W +FR_AA = f47 +FR_log2_hi = f47 +// Shared with AA +FR_BB = f48 +FR_log2_lo = f48 +// Shared with BB +FR_S_lo = f49 +FR_two_negN = f50 +FR_float_N = f51 +FR_Q4 = f52 +FR_dummy = f52 +// Shared with Q4 +FR_P4 = f52 +// Shared with Q4 +FR_Threshold = f52 +// Shared with Q4 +FR_Q3 = f53 +FR_P3 = f53 +// Shared with Q3 +FR_Tiny = f53 +// Shared with Q3 +FR_Q2 = f54 +FR_P2 = f54 +// Shared with Q2 +FR_1LN10_hi = f54 +// Shared 
with Q2 +FR_Q1 = f55 +FR_P1 = f55 +// Shared with Q1 +FR_1LN10_lo = f55 +// Shared with Q1 +FR_P5 = f98 +FR_SCALE = f98 +FR_Output_X_tmp = f99 + +GR_Expo_Range = r32 +GR_Table_Base = r34 +GR_Table_Base1 = r35 +GR_Table_ptr = r36 +GR_Index2 = r37 +GR_signif = r38 +GR_X_0 = r39 +GR_X_1 = r40 +GR_X_2 = r41 +GR_Z_1 = r42 +GR_Z_2 = r43 +GR_N = r44 +GR_Bias = r45 +GR_M = r46 +GR_ScaleN = r47 +GR_Index3 = r48 +GR_Perturb = r49 +GR_Table_Scale = r50 + + +GR_SAVE_PFS = r51 +GR_SAVE_B0 = r52 +GR_SAVE_GP = r53 + +GR_Parameter_X = r54 +GR_Parameter_Y = r55 +GR_Parameter_RESULT = r56 + +GR_Parameter_TAG = r57 + + +.section .text +.proc log1p# +.global log1p# +.align 64 +log1p: +#ifdef _LIBC +.global __log1p +__log1p: +#endif + +{ .mfi +alloc r32 = ar.pfs,0,22,4,0 +(p0) fsub.s1 FR_Neg_One = f0,f1 +(p0) cmp.eq.unc p7, p0 = r0, r0 +} + +{ .mfi +(p0) cmp.ne.unc p14, p0 = r0, r0 +(p0) fnorm.s1 FR_X_Prime = FR_Input_X +(p0) cmp.eq.unc p15, p0 = r0, r0 ;; +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fadd FR_Em1 = f0,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fadd FR_E = f0,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One + nop.i 999 +} + + +L(LOG_BEGIN): + +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_Z = FR_X_Prime, FR_E + nop.i 999 +} + +{ .mlx + nop.m 999 +(p0) movl GR_Table_Scale = 0x0000000000000018 ;; +} + +{ .mmi + nop.m 999 +// +// Create E = 1 and Em1 = 0 +// Check for X == 0, meaning log(1+0) +// Check for X < -1, meaning log(negative) +// Check for X == -1, meaning log(0) +// Normalize x +// Identify NatVals, NaNs, Infs. +// Identify EM unsupporteds. 
+// Identify Negative values - us S1 so as +// not to raise denormal operand exception +// Set p15 to true for log1p +// Set p14 to false for log1p +// Set p7 true for log and log1p +// +(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmax.s1 FR_AA = FR_X_Prime, FR_E + nop.i 999 ;; +} + +{ .mfi + ld8 GR_Table_Base = [GR_Table_Base] +(p0) fmin.s1 FR_BB = FR_X_Prime, FR_E + nop.i 999 +} + +{ .mfb + nop.m 999 +(p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1 +// +// Begin load of constants base +// FR_Z = Z = |x| + E +// FR_W = W = |x| + Em1 +// AA = fmax(|x|,E) +// BB = fmin(|x|,E) +// +(p6) br.cond.spnt L(LOG_64_special) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.spnt L(LOG_64_unsupported) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p13) br.cond.spnt L(LOG_64_negative) ;; +} + +{ .mib +(p0) getf.sig GR_signif = FR_Z + nop.i 999 +(p9) br.cond.spnt L(LOG_64_one) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p8) br.cond.spnt L(LOG_64_zero) ;; +} + +{ .mfi +(p0) getf.exp GR_N = FR_Z +// +// Raise possible denormal operand exception +// Create Bias +// +// This function computes ln( x + e ) +// Input FR 1: FR_X = FR_Input_X +// Input FR 2: FR_E = FR_E +// Input FR 3: FR_Em1 = FR_Em1 +// Input GR 1: GR_Expo_Range = GR_Expo_Range = 1 +// Output FR 4: FR_Y_hi +// Output FR 5: FR_Y_lo +// Output FR 6: FR_Scale +// Output PR 7: PR_Safe +// +(p0) fsub.s1 FR_S_lo = FR_AA, FR_Z +// +// signif = getf.sig(Z) +// abs_W = fabs(w) +// +(p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.se FR_S_hi = f1,FR_Z +(p0) extr.u GR_X_0 = GR_signif, 49, 15 +} + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp + nop.i 999 +} +;; + +{ .mlx + ld8 GR_Table_Base1 = [GR_Table_Base1] +(p0) movl GR_Bias = 0x000000000000FFFF ;; +} + +{ .mfi + nop.m 999 +(p0) fabs FR_abs_W = FR_W +(p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0 +} + +{ .mfi + nop.m 999 +// +// Branch out for 
special input values +// +(p0) fcmp.lt.unc.s0 p8, p0 = FR_Input_X, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// X_0 = extr.u(signif,49,15) +// Index1 = extr.u(signif,59,4) +// +(p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB + nop.i 999 ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +// +// Offset_to_Z1 = 24 * Index1 +// For performance, don't use result +// for 3 or 4 cycles. +// +(p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;; +} +// +// Add Base to Offset for Z1 +// Create Bias + +{ .mmi +(p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;; +(p0) ldfs FR_G = [GR_Table_ptr],4 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfs FR_H = [GR_Table_ptr],8 ;; +(p0) ldfd FR_h = [GR_Table_ptr],0 +(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 +} +// +// Load Z_1 +// Get Base of Table2 +// + +{ .mfi +(p0) getf.exp GR_M = FR_abs_W + nop.f 999 + nop.i 999 ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +// +// M = getf.exp(abs_W) +// S_lo = AA - Z +// X_1 = pmpyshr2(X_0,Z_1,15) +// +(p0) sub GR_M = GR_M, GR_Bias ;; +} +// +// M = M - Bias +// Load G1 +// N = getf.exp(Z) +// + +{ .mii +(p0) cmp.gt.unc p11, p0 = -80, GR_M +(p0) cmp.gt.unc p12, p0 = -7, GR_M ;; +(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;; +} + +{ .mib + nop.m 999 +// +// if -80 > M, set p11 +// Index2 = extr.u(X_1,6,4) +// if -7 > M, set p12 +// Load H1 +// +(p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0 +(p11) br.cond.spnt L(log1p_small) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p12) br.cond.spnt L(log1p_near) ;; +} + +{ .mii +(p0) sub GR_N = GR_N, GR_Bias +// +// poly_lo = r * poly_lo +// +(p0) add GR_Perturb = 0x1, r0 ;; +(p0) sub GR_ScaleN = GR_Bias, GR_N +} + +{ .mii +(p0) setf.sig FR_float_N = GR_N + nop.i 999 ;; +// +// Prepare Index2 - pmpyshr2.u(X_1,Z_2,15) +// Load h1 +// S_lo = S_lo + BB +// Branch for -80 > M +// +(p0) add GR_Index2 = GR_Index2, GR_Table_Base1 +} + +{ .mmi +(p0) setf.exp FR_two_negN = GR_ScaleN + nop.m 999 +(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp +};; + +// +// Index2 points to Z2 +// Branch for -7 
> M +// + +{ .mmb +(p0) ld4 GR_Z_2 = [GR_Index2],4 + ld8 GR_Table_Base = [GR_Table_Base] + nop.b 999 ;; +} +(p0) nop.i 999 +// +// Load Z_2 +// N = N - Bias +// Tablebase points to Table3 +// + +{ .mmi +(p0) ldfs FR_G_tmp = [GR_Index2],4 ;; +// +// Load G_2 +// pmpyshr2 X_2= (X_1,Z_2,15) +// float_N = setf.sig(N) +// ScaleN = Bias - N +// +(p0) ldfs FR_H_tmp = [GR_Index2],8 + nop.i 999 ;; +} +// +// Load H_2 +// two_negN = setf.exp(scaleN) +// G = G_1 * G_2 +// + +{ .mfi +(p0) ldfd FR_h_tmp = [GR_Index2],0 + nop.f 999 +(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;; +} + +{ .mii + nop.m 999 +(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;; +// +// Load h_2 +// H = H_1 + H_2 +// h = h_1 + h_2 +// Index3 = extr.u(X_2,1,5) +// +(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base +} + +{ .mmi + nop.m 999 + nop.m 999 +// +// float_N = fcvt.xf(float_N) +// load G3 +// +(p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;; +} + +{ .mfi +ld8 GR_Table_Base = [GR_Table_Base] +nop.f 999 +nop.i 999 +} ;; + +{ .mfi +(p0) ldfe FR_log2_hi = [GR_Table_Base],16 +(p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN + nop.i 999 ;; +} + +{ .mmf + nop.m 999 +// +// G = G3 * G +// Load h3 +// Load log2_hi +// H = H + H3 +// +(p0) ldfe FR_log2_lo = [GR_Table_Base],16 +(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;; +} + +{ .mmf +(p0) ldfs FR_G_tmp = [GR_Index3],4 +// +// h = h + h3 +// r = G * S_hi + 1 +// Load log2_lo +// +(p0) ldfe FR_Q4 = [GR_Table_Base],16 +(p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;; +} + +{ .mfi +(p0) ldfe FR_Q3 = [GR_Table_Base],16 +(p0) fadd.s1 FR_H = FR_H, FR_H_tmp + nop.i 999 ;; +} + +{ .mmf +(p0) ldfs FR_H_tmp = [GR_Index3],4 +(p0) ldfe FR_Q2 = [GR_Table_Base],16 +// +// Comput Index for Table3 +// S_lo = S_lo * two_negN +// +(p0) fcvt.xf FR_float_N = FR_float_N ;; +} +// +// If S_lo == 0, set p8 false +// Load H3 +// Load ptr to table of polynomial coeff. 
+// + +{ .mmf +(p0) ldfd FR_h_tmp = [GR_Index3],0 +(p0) ldfe FR_Q1 = [GR_Table_Base],0 +(p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;; +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_H = FR_H, FR_H_tmp + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fms.s1 FR_r = FR_G, FR_S_hi, f1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_h = FR_h, FR_h_tmp + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Load Q4 +// Load Q3 +// Load Q2 +// Load Q1 +// +(p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// poly_lo = r * Q4 + Q3 +// rsq = r* r +// +(p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// If (S_lo!=0) r = s_lo * G + r +// +(p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 + nop.i 999 +} +// +// Create a 0x00000....01 +// poly_lo = poly_lo * rsq + h +// + +{ .mfi +(p0) setf.sig FR_dummy = GR_Perturb +(p0) fmpy.s1 FR_rsq = FR_r, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// h = N * log2_lo + h +// Y_hi = n * log2_hi + H +// +(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// poly_lo = r * poly_o + Q2 +// poly_hi = Q1 * rsq + r +// +(p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo +// +// Create the FR for a binary "or" +// Y_lo = poly_hi + poly_lo +// +// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;; +// +// Turn the lsb of Y_lo ON +// +// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;; +// +// Merge the new lsb into Y_lo, for alone doesn't +// +(p0) br.cond.sptk L(LOG_main) ;; +} + + +L(log1p_near): + +{ .mmi + nop.m 999 + nop.m 999 +// 
/*******************************************************/ +// /*********** Branch log1p_near ************************/ +// /*******************************************************/ +(p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;; +} +// +// Load base address of poly. coeff. +// +{.mmi + nop.m 999 + ld8 GR_Table_Base = [GR_Table_Base] + nop.i 999 +};; + +{ .mmb +(p0) add GR_Table_ptr = 0x40,GR_Table_Base +// +// Address tables with separate pointers +// +(p0) ldfe FR_P8 = [GR_Table_Base],16 + nop.b 999 ;; +} + +{ .mmb +(p0) ldfe FR_P4 = [GR_Table_ptr],16 +// +// Load P4 +// Load P8 +// +(p0) ldfe FR_P7 = [GR_Table_Base],16 + nop.b 999 ;; +} + +{ .mmf +(p0) ldfe FR_P3 = [GR_Table_ptr],16 +// +// Load P3 +// Load P7 +// +(p0) ldfe FR_P6 = [GR_Table_Base],16 +(p0) fmpy.s1 FR_wsq = FR_W, FR_W ;; +} + +{ .mfi +(p0) ldfe FR_P2 = [GR_Table_ptr],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3 + nop.i 999 +} +// +// Load P2 +// Load P6 +// Wsq = w * w +// Y_hi = p4 * w + p3 +// + +{ .mfi +(p0) ldfe FR_P5 = [GR_Table_Base],16 +(p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7 + nop.i 999 ;; +} + +{ .mfi +(p0) ldfe FR_P1 = [GR_Table_ptr],16 +// +// Load P1 +// Load P5 +// Y_lo = p8 * w + P7 +// +(p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P6 +(p0) add GR_Perturb = 0x1, r0 ;; +} + +{ .mfi + nop.m 999 +// +// w4 = w2 * w2 +// Y_hi = y_hi * w + p2 +// Y_lo = y_lo * w + p6 +// Create perturbation bit +// +(p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P1 + nop.i 999 +} +// +// Y_hi = y_hi * w + p1 +// w6 = w4 * w2 +// + +{ .mfi +(p0) setf.sig FR_Q4 = GR_Perturb +(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_wsq,FR_Y_hi, FR_W + nop.i 999 +} + +{ .mfb + 
nop.m 999 +// +// Y_hi = y_hi * wsq + w +// Y_lo = y_lo * w + p5 +// +(p0) fmpy.s1 FR_Y_lo = FR_w6, FR_Y_lo +// +// Y_lo = y_lo * w6 +// +// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;; +// +// Set lsb on: Taken out to improve performance +// +// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;; +// +// Make sure it's on in Y_lo also. Taken out to improve +// performance +// +(p0) br.cond.sptk L(LOG_main) ;; +} + + +L(log1p_small): + +{ .mmi + nop.m 999 + nop.m 999 +// /*******************************************************/ +// /*********** Branch log1p_small ***********************/ +// /*******************************************************/ +(p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp +} + +{ .mfi + nop.m 999 +(p0) mov FR_Em1 = FR_W +(p0) cmp.eq.unc p7, p0 = r0, r0 ;; +} + +{ .mlx + ld8 GR_Table_Base = [GR_Table_Base] +(p0) movl GR_Expo_Range = 0x0000000000000002 ;; +} +// +// Set Safe to true +// Set Expo_Range = 0 for single +// Set Expo_Range = 2 for double +// Set Expo_Range = 4 for double-extended +// + +{ .mmi +(p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;; +(p0) ldfe FR_Threshold = [GR_Table_Base],16 + nop.i 999 +} + +{ .mlx + nop.m 999 +(p0) movl GR_Bias = 0x000000000000FF9B ;; +} + +{ .mfi +(p0) ldfe FR_Tiny = [GR_Table_Base],0 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W + nop.i 999 +} + +{ .mfi + nop.m 999 +(p13) fadd FR_SCALE = f0, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fsub.s1 FR_Y_lo = f0, FR_Tiny +(p12) cmp.ne.unc p7, p0 = r0, r0 +} + +{ .mfi +(p12) setf.exp FR_SCALE = GR_Bias + nop.f 999 + nop.i 999 ;; +} + +// +// Set p7 to SAFE = FALSE +// Set Scale = 2^-100 +// +{ .mfb + nop.m 999 +(p0) fma.d.s0 FR_Input_X = FR_Y_lo,FR_SCALE,FR_Y_hi +(p0) br.ret.sptk b0 +} +;; + +L(LOG_64_one): + +{ .mfb + nop.m 999 +(p0) fmpy.d.s0 FR_Input_X = FR_Input_X, f0 +(p0) br.ret.sptk b0 +} +;; + +// 
+// Raise divide by zero for +/-0 input. +// +L(LOG_64_zero): + +{ .mfi +(p0) mov GR_Parameter_TAG = 140 +// +// If we have log1p(0), return -Inf. +// +(p0) fsub.s0 FR_Output_X_tmp = f0, f1 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 +(p0) br.cond.sptk L(LOG_ERROR_Support) ;; +} + +L(LOG_64_special): + +{ .mfi + nop.m 999 +// +// Return -Inf or value from handler. +// +(p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Check for Natval, QNan, SNaN, +Inf +// +(p7) fmpy.d.s0 f8 = FR_Input_X, f1 +// +// For SNaN raise invalid and return QNaN. +// For QNaN raise invalid and return QNaN. +// For +Inf return +Inf. +// +(p7) br.ret.sptk b0 +} +;; + +// +// For -Inf raise invalid and return QNaN. +// + +{ .mfb +(p0) mov GR_Parameter_TAG = 141 +(p0) fmpy.d.s0 FR_Output_X_tmp = FR_Input_X, f0 +(p0) br.cond.sptk L(LOG_ERROR_Support) ;; +} + +// +// Report that log1p(-Inf) computed +// + +L(LOG_64_unsupported): + +// +// Return generated NaN or other value . +// + +{ .mfb + nop.m 999 +(p0) fmpy.d.s0 FR_Input_X = FR_Input_X, f0 +(p0) br.ret.sptk b0 ;; +} + +L(LOG_64_negative): + +{ .mfi + nop.m 999 +// +// Deal with x < 0 in a special way +// +(p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 +// +// Deal with x < 0 in a special way - raise +// invalid and produce QNaN indefinite. 
+// +(p0) mov GR_Parameter_TAG = 141 +} + +.endp log1p# +ASM_SIZE_DIRECTIVE(log1p) + +.proc __libm_error_region +__libm_error_region: +L(LOG_ERROR_Support): +.prologue + +// (1) Save ar.pfs and gp, form the Parameter 2 address, allocate a 64-byte frame +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 address (32 bytes above the new sp) + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new 64-byte stack frame + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) Store Parameter 2 (f0), form the Parameter 1 address, save b0 +{ .mmi + stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack, then advance pointer by 16 + add GR_Parameter_X = 16,sp // Parameter 1 address + .save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) Store Parameters 1 and 3, then call __libm_error_support +{ .mib + stfd [GR_Parameter_X] =FR_Input_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = FR_Output_X_tmp // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y // Step back to the Parameter 2 address + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp // Re-form the Parameter 3 (result) address after the call +};; + +// (4) Reload the result, release the frame, restore b0, gp and ar.pfs, return +{ .mmi + ldfd FR_Input_X = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.proc __libm_LOG_main +__libm_LOG_main: +L(LOG_main): + +// Final step: combine the Y_hi and Y_lo pair into the result in FR_Input_X. +// kernel_log_64 computes ln(X + E) +// (p7) adds Y_lo and Y_hi; (p14) multiplies the pair by the Constants_1_by_LN10 hi and lo values. + +{ .mfi + nop.m 999 +(p7) fadd.d.s0 FR_Input_X = FR_Y_lo,FR_Y_hi + nop.i 999 +} + +{ .mmi + nop.m 999 + nop.m 999 +(p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;; +} + +{ .mmi + nop.m 999 +(p14) ld8 GR_Table_Base = [GR_Table_Base] + nop.i 999 +};; + +{ .mmi +(p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;; +(p14) ldfe FR_1LN10_lo = [GR_Table_Base] + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi + nop.i 999
;; +} + +{ .mfi + nop.m 999 +(p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p14) fma.d.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp +(p0) br.ret.sptk b0 ;; +} +.endp __libm_LOG_main +ASM_SIZE_DIRECTIVE(__libm_LOG_main) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_log1pf.S b/sysdeps/ia64/fpu/s_log1pf.S new file mode 100644 index 0000000000..7f21ccac82 --- /dev/null +++ b/sysdeps/ia64/fpu/s_log1pf.S @@ -0,0 +1,1616 @@ +.file "log1pf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource.
+// +// History +//============================================================== +// 2/02/00 Initial version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// +// ********************************************************************* +// +// Function: log1pf(x) = ln(x+1), for single precision values +// +// ********************************************************************* +// +// Accuracy: Very accurate for single precision values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9,f33-f55,f99 +// +// General Purpose Registers: +// r32-r53 +// r54-r57 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions cannot occur +// Underflow exceptions raised when appropriate for log1pf +// (Error Handling Routine called for underflow) +// Inexact raised when appropriate by algorithm +// +// log1pf(inf) = inf +// log1pf(-inf) = QNaN +// log1pf(+/-0) = +/-0 +// log1pf(-1) = -inf +// log1pf(SNaN) = QNaN +// log1pf(QNaN) = QNaN +// log1pf(EM_special Values) = QNaN +// +// ********************************************************************* +// +// Computation is based on the following kernel. +// +// ker_log_64( in_FR : X, +// in_FR : E, +// in_FR : Em1, +// in_GR : Expo_Range, +// out_FR : Y_hi, +// out_FR : Y_lo, +// out_FR : Scale, +// out_PR : Safe ) +// +// Overview +// +// The method consists of three cases. +// +// If |X+Em1| < 2^(-80) use case log1pf_small; +// elseif |X+Em1| < 2^(-7) use case log_near1; +// else use case log_regular; +// +// Case log1pf_small: +// +// log( 1 + (X+Em1) ) can be approximated by (X+Em1). 
+// +// Case log_near1: +// +// log( 1 + (X+Em1) ) can be approximated by a simple polynomial +// in W = X+Em1. This polynomial resembles the truncated Taylor +// series W - W^2/2 + W^3/3 - ... +// +// Case log_regular: +// +// Here we use a table lookup method. The basic idea is that in +// order to compute log(Arg) for an argument Arg in [1,2), we +// construct a value G such that G*Arg is close to 1 and that +// log(1/G) is obtainable easily from a table of values calculated +// beforehand. Thus +// +// log(Arg) = log(1/G) + log(G*Arg) +// = log(1/G) + log(1 + (G*Arg - 1)) +// +// Because |G*Arg - 1| is small, the second term on the right hand +// side can be approximated by a short polynomial. We elaborate +// this method in four steps. +// +// Step 0: Initialization +// +// We need to calculate log( E + X ). Obtain N, S_hi, S_lo such that +// +// E + X = 2^N * ( S_hi + S_lo ) exactly +// +// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense +// that |S_lo| <= ulp(S_hi). +// +// Step 1: Argument Reduction +// +// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate +// +// G := G_1 * G_2 * G_3 +// r := (G * S_hi - 1) + G * S_lo +// +// These G_j's have the property that the product is exactly +// representable and that |r| < 2^(-12) as a result. +// +// Step 2: Approximation +// +// +// log(1 + r) is approximated by a short polynomial poly(r). +// +// Step 3: Reconstruction +// +// +// Finally, log( E + X ) is given by +// +// log( E + X ) = log( 2^N * (S_hi + S_lo) ) +// ~=~ N*log(2) + log(1/G) + log(1 + r) +// ~=~ N*log(2) + log(1/G) + poly(r). +// +// **** Algorithm **** +// +// Case log1pf_small: +// +// Although log(1 + (X+Em1)) is basically X+Em1, we would like to +// preserve the inexactness nature as well as consistent behavior +// under different rounding modes. Note that this case can only be +// taken if E is set to be 1.0.
In this case, Em1 is zero, and that +// X can be very tiny and thus the final result can possibly underflow. +// Thus, we compare X against a threshold that is dependent on the +// input Expo_Range. If |X| is smaller than this threshold, we set +// SAFE to be FALSE. +// +// The result is returned as Y_hi, Y_lo, and in the case of SAFE +// is FALSE, an additional value Scale is also returned. +// +// W := X + Em1 +// Threshold := Threshold_Table( Expo_Range ) +// Tiny := Tiny_Table( Expo_Range ) +// +// If ( |W| > Threshold ) then +// Y_hi := W +// Y_lo := -W*W +// Else +// Y_hi := W +// Y_lo := -Tiny +// Scale := 2^(-100) +// Safe := FALSE +// EndIf +// +// +// One may think that Y_lo should be -W*W/2; however, it does not matter +// as Y_lo will be rounded off completely except for the correct effect in +// directed rounding. Clearly -W*W is simpler to compute. Moreover, +// because of the difference in exponent value, Y_hi + Y_lo or +// Y_hi + Scale*Y_lo is always inexact. +// +// Case log_near1: +// +// Here we compute a simple polynomial. To exploit parallelism, we split +// the polynomial into two portions. +// +// W := X + Em1 +// Wsq := W * W +// W4 := Wsq*Wsq +// W6 := W4*Wsq +// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4))) +// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8))) +// set lsb(Y_lo) to be 1 +// +// Case log_regular: +// +// We present the algorithm in four steps. +// +// Step 0. Initialization +// ---------------------- +// +// Z := X + E +// N := unbiased exponent of Z +// S_hi := 2^(-N) * Z +// S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) } +// +// Note that S_lo is always 0 for the case E = 0. +// +// Step 1. Argument Reduction +// -------------------------- +// +// Let +// +// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63 +// +// We obtain G_1, G_2, G_3 by the following steps. +// +// +// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted +// from S_hi. +// +// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated +// to lsb = 2^(-4).
+// +// Define index_1 := [ d_1 d_2 d_3 d_4 ]. +// +// Fetch Z_1 := (1/A_1) rounded UP in fixed point with +// fixed point lsb = 2^(-15). +// Z_1 looks like z_0.z_1 z_2 ... z_15 +// Note that the fetching is done using index_1. +// A_1 is actually not needed in the implementation +// and is used here only to explain how the value +// Z_1 is defined. +// +// Fetch G_1 := (1/A_1) truncated to 21 sig. bits. +// floating pt. Again, fetching is done using index_1. A_1 +// explains how G_1 is defined. +// +// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) +// = 1.0 0 0 0 d_5 ... d_14 +// This is accomplished by integer multiplication. +// It is proved that X_1 indeed always begins +// with 1.0000 in fixed point. +// +// +// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 +// truncated to lsb = 2^(-8). Similar to A_1, +// A_2 is not needed in actual implementation. It +// helps explain how some of the values are defined. +// +// Define index_2 := [ d_5 d_6 d_7 d_8 ]. +// +// Fetch Z_2 := (1/A_2) rounded UP in fixed point with +// fixed point lsb = 2^(-15). Fetch done using index_2. +// Z_2 looks like z_0.z_1 z_2 ... z_15 +// +// Fetch G_2 := (1/A_2) truncated to 21 sig. bits. +// floating pt. +// +// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) +// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 +// This is accomplished by integer multiplication. +// It is proved that X_2 indeed always begins +// with 1.00000000 in fixed point. +// +// +// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. +// This is 2^(-14) + X_2 truncated to lsb = 2^(-13). +// +// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. +// +// Fetch G_3 := (1/A_3) truncated to 21 sig. bits. +// floating pt. Fetch is done using index_3. +// +// Compute G := G_1 * G_2 * G_3. +// +// This is done exactly since each of G_j only has 21 sig. bits. +// +// Compute +// +// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations.
+// +// Thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of +// rounding errors. +// +// +// Step 2. Approximation +// --------------------- +// +// This step computes an approximation to log( 1 + r ) where r is the +// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13); +// thus log(1+r) can be approximated by a short polynomial: +// +// log(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 +// +// +// Step 3. Reconstruction +// ---------------------- +// +// This step computes the desired result of log(X+E): +// +// log(X+E) = log( 2^N * (S_hi + S_lo) ) +// = N*log(2) + log( S_hi + S_lo ) +// = N*log(2) + log(1/G) + +// log(1 + G*(S_hi+S_lo) - 1 ) +// +// log(2), log(1/G_j) are stored as pairs of (single,double) numbers: +// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are +// single-precision numbers and the low parts are double precision +// numbers. These have the property that +// +// N*log2_hi + SUM ( log1byGj_hi ) +// +// is computable exactly in double-extended precision (64 sig. bits).
+// Finally +// +// Y_hi := N*log2_hi + SUM ( log1byGj_hi ) +// Y_lo := poly_hi + [ poly_lo + +// ( SUM ( log1byGj_lo ) + N*log2_lo ) ] +// set lsb(Y_lo) to be 1 +// + +#include "libm_support.h" + +#ifdef _LIBC +.rodata +#else +.data +#endif + +// P_7, P_6, P_5, P_4, P_3, P_2, and P_1 + +.align 64 +Constants_P: +ASM_TYPE_DIRECTIVE(Constants_P,@object) +data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000 +data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000 +data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000 +data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000 +data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000 +data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000 +data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000 +data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_P) + +// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 + +.align 64 +Constants_Q: +ASM_TYPE_DIRECTIVE(Constants_Q,@object) +data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 +data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 +data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 +data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 +data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 +data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_Q) + +// Z1 - 16 bit fixed, G1 and H1 - IEEE single + +.align 64 +Constants_Z_G_H_h1: +ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object) +data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6 +data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6 +data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF +data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C +data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C +data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F +data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B +data4 
0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34 +data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E +data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C +data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3 +data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2 +data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895 +data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5 +data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874 +ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1) + +// Z2 - 16 bit fixed, G2 and H2 - IEEE single + +.align 64 +Constants_Z_G_H_h2: +ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object) +data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116 +data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF +data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E +data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0 +data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F +data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791 +data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C +data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156 +data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97 +data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483 +data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9 +data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06 +data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202 +data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4 +data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391 +ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2) + +// G3 and H3 - IEEE single and h3 -IEEE 
double + +.align 64 +Constants_Z_G_H_h3: +ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object) +data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595 +data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2 +data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D +data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291 +data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8 +data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707 +data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9 +data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47 +data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E +data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D +data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441 +data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95 +data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC +data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337 +data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B +data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B +data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21 +data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4 +data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070 +data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC +data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83 +data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40 +data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7 +data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B +data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E +data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06 +data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1 +data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103 +data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B +data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19 +data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502 +data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17 +ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3) + +// +// Exponent Thresholds and Tiny Thresholds +// for 8, 11, 15, and 17 bit exponents +// +// Expo_Range Value +// +// 0 (8 bits) 2^(-126) +// 1 (11 bits) 2^(-1022) +// 2 (15 bits) 2^(-16382) +// 3 (17 bits) 2^(-16382) +// +// 
Tiny_Table +// ---------- +// Expo_Range Value +// +// 0 (8 bits) 2^(-16382) +// 1 (11 bits) 2^(-16382) +// 2 (15 bits) 2^(-16382) +// 3 (17 bits) 2^(-16382) +// + +.align 64 +Constants_Threshold: +ASM_TYPE_DIRECTIVE(Constants_Threshold,@object) +data4 0x00000000,0x80000000,0x00003F81,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00003C01,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_Threshold) + +.align 64 +Constants_1_by_LN10: +ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object) +data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000 +data4 0xACCF70C8,0xD56EAABE,0x00003FBD,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_1_by_LN10) + +FR_Input_X = f8 +FR_Neg_One = f9 +FR_E = f33 +FR_Em1 = f34 +FR_Y_hi = f34 +// Shared with Em1 +FR_Y_lo = f35 +FR_Scale = f36 +FR_X_Prime = f37 +FR_Z = f38 +FR_S_hi = f38 +// Shared with Z +FR_W = f39 +FR_G = f40 +FR_wsq = f40 +// Shared with G +FR_H = f41 +FR_w4 = f41 +// Shared with H +FR_h = f42 +FR_w6 = f42 +// Shared with h +FR_G_tmp = f43 +FR_poly_lo = f43 +// Shared with G_tmp +FR_P8 = f43 +// Shared with G_tmp +FR_H_tmp = f44 +FR_poly_hi = f44 + // Shared with H_tmp +FR_P7 = f44 +// Shared with H_tmp +FR_h_tmp = f45 +FR_rsq = f45 +// Shared with h_tmp +FR_P6 = f45 +// Shared with h_tmp +FR_abs_W = f46 +FR_r = f46 +// Shared with abs_W +FR_AA = f47 +FR_log2_hi = f47 +// Shared with AA +FR_BB = f48 +FR_log2_lo = f48 +// Shared with BB +FR_S_lo = f49 +FR_two_negN = f50 +FR_float_N = f51 +FR_Q4 = f52 +FR_dummy = f52 +// Shared with Q4 +FR_P4 = f52 +// Shared with Q4 +FR_Threshold = f52 +// Shared with Q4 +FR_Q3 = f53 +FR_P3 = f53 +// Shared with Q3 +FR_Tiny = f53 +// Shared with Q3 +FR_Q2 = f54 +FR_P2 = f54 +// Shared with Q2 +FR_1LN10_hi = f54 +// Shared 
with Q2 +FR_Q1 = f55 +FR_P1 = f55 +// Shared with Q1 +FR_1LN10_lo = f55 +// Shared with Q1 +FR_P5 = f98 +FR_SCALE = f98 +FR_Output_X_tmp = f99 + +GR_Expo_Range = r32 +GR_Table_Base = r34 +GR_Table_Base1 = r35 +GR_Table_ptr = r36 +GR_Index2 = r37 +GR_signif = r38 +GR_X_0 = r39 +GR_X_1 = r40 +GR_X_2 = r41 +GR_Z_1 = r42 +GR_Z_2 = r43 +GR_N = r44 +GR_Bias = r45 +GR_M = r46 +GR_ScaleN = r47 +GR_Index3 = r48 +GR_Perturb = r49 +GR_Table_Scale = r50 + + +GR_SAVE_PFS = r51 +GR_SAVE_B0 = r52 +GR_SAVE_GP = r53 + +GR_Parameter_X = r54 +GR_Parameter_Y = r55 +GR_Parameter_RESULT = r56 + +GR_Parameter_TAG = r57 + + +.section .text +.proc log1pf# +.global log1pf# +.align 64 +log1pf: +#ifdef _LIBC +.global __log1pf +__log1pf: +#endif + +{ .mfi +alloc r32 = ar.pfs,0,22,4,0 +(p0) fsub.s1 FR_Neg_One = f0,f1 +(p0) cmp.eq.unc p7, p0 = r0, r0 +} + +{ .mfi +(p0) cmp.ne.unc p14, p0 = r0, r0 +(p0) fnorm.s1 FR_X_Prime = FR_Input_X +(p0) cmp.eq.unc p15, p0 = r0, r0 ;; +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fadd FR_Em1 = f0,f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fadd FR_E = f0,f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One + nop.i 999 +} + + +L(LOG_BEGIN): + +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_Z = FR_X_Prime, FR_E + nop.i 999 +} + +{ .mlx + nop.m 999 +(p0) movl GR_Table_Scale = 0x0000000000000018 ;; +} + +{ .mmi + nop.m 999 +// +// Create E = 1 and Em1 = 0 +// Check for X == 0, meaning log(1+0) +// Check for X < -1, meaning log(negative) +// Check for X == -1, meaning log(0) +// Normalize x +// Identify NatVals, NaNs, Infs. +// Identify EM unsupporteds. 
+// Identify Negative values - us S1 so as +// not to raise denormal operand exception +// Set p15 to true for log1pf +// Set p14 to false for log1pf +// Set p7 true for log and log1pf +// +(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fmax.s1 FR_AA = FR_X_Prime, FR_E + nop.i 999 ;; +} + +{ .mfi + ld8 GR_Table_Base = [GR_Table_Base] +(p0) fmin.s1 FR_BB = FR_X_Prime, FR_E + nop.i 999 +} + +{ .mfb + nop.m 999 +(p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1 +// +// Begin load of constants base +// FR_Z = Z = |x| + E +// FR_W = W = |x| + Em1 +// AA = fmax(|x|,E) +// BB = fmin(|x|,E) +// +(p6) br.cond.spnt L(LOG_64_special) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.spnt L(LOG_64_unsupported) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p13) br.cond.spnt L(LOG_64_negative) ;; +} + +{ .mib +(p0) getf.sig GR_signif = FR_Z + nop.i 999 +(p9) br.cond.spnt L(LOG_64_one) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p8) br.cond.spnt L(LOG_64_zero) ;; +} + +{ .mfi +(p0) getf.exp GR_N = FR_Z +// +// Raise possible denormal operand exception +// Create Bias +// +// This function computes ln( x + e ) +// Input FR 1: FR_X = FR_Input_X +// Input FR 2: FR_E = FR_E +// Input FR 3: FR_Em1 = FR_Em1 +// Input GR 1: GR_Expo_Range = GR_Expo_Range = 1 +// Output FR 4: FR_Y_hi +// Output FR 5: FR_Y_lo +// Output FR 6: FR_Scale +// Output PR 7: PR_Safe +// +(p0) fsub.s1 FR_S_lo = FR_AA, FR_Z +// +// signif = getf.sig(Z) +// abs_W = fabs(w) +// +(p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;; +} + +{ .mfi + nop.m 999 +(p0) fmerge.se FR_S_hi = f1,FR_Z +(p0) extr.u GR_X_0 = GR_signif, 49, 15 +} + +{ .mmi + nop.m 999 +(p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp + nop.i 999 +} +;; + +{ .mlx + ld8 GR_Table_Base1 = [GR_Table_Base1] +(p0) movl GR_Bias = 0x000000000000FFFF ;; +} + +{ .mfi + nop.m 999 +(p0) fabs FR_abs_W = FR_W +(p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0 +} + +{ .mfi + nop.m 999 +// +// Branch out for 
special input values +// +(p0) fcmp.lt.unc.s0 p8, p0 = FR_Input_X, f0 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// X_0 = extr.u(signif,49,15) +// Index1 = extr.u(signif,59,4) +// +(p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB + nop.i 999 ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +// +// Offset_to_Z1 = 24 * Index1 +// For performance, don't use result +// for 3 or 4 cycles. +// +(p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;; +} +// +// Add Base to Offset for Z1 +// Create Bias + +{ .mmi +(p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;; +(p0) ldfs FR_G = [GR_Table_ptr],4 + nop.i 999 ;; +} + +{ .mmi +(p0) ldfs FR_H = [GR_Table_ptr],8 ;; +(p0) ldfd FR_h = [GR_Table_ptr],0 +(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 +} +// +// Load Z_1 +// Get Base of Table2 +// + +{ .mfi +(p0) getf.exp GR_M = FR_abs_W + nop.f 999 + nop.i 999 ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +// +// M = getf.exp(abs_W) +// S_lo = AA - Z +// X_1 = pmpyshr2(X_0,Z_1,15) +// +(p0) sub GR_M = GR_M, GR_Bias ;; +} +// +// M = M - Bias +// Load G1 +// N = getf.exp(Z) +// + +{ .mii +(p0) cmp.gt.unc p11, p0 = -80, GR_M +(p0) cmp.gt.unc p12, p0 = -7, GR_M ;; +(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;; +} + +{ .mib + nop.m 999 +// +// if -80 > M, set p11 +// Index2 = extr.u(X_1,6,4) +// if -7 > M, set p12 +// Load H1 +// +(p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0 +(p11) br.cond.spnt L(log1pf_small) ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p12) br.cond.spnt L(log1pf_near) ;; +} + +{ .mii +(p0) sub GR_N = GR_N, GR_Bias +// +// poly_lo = r * poly_lo +// +(p0) add GR_Perturb = 0x1, r0 ;; +(p0) sub GR_ScaleN = GR_Bias, GR_N +} + +{ .mii +(p0) setf.sig FR_float_N = GR_N + nop.i 999 ;; +// +// Prepare Index2 - pmpyshr2.u(X_1,Z_2,15) +// Load h1 +// S_lo = S_lo + BB +// Branch for -80 > M +// +(p0) add GR_Index2 = GR_Index2, GR_Table_Base1 +} + +{ .mmi +(p0) setf.exp FR_two_negN = GR_ScaleN + nop.m 999 +(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp +};; + +// +// Index2 points to Z2 +// Branch for 
-7 > M +// + +{ .mmb +(p0) ld4 GR_Z_2 = [GR_Index2],4 + ld8 GR_Table_Base = [GR_Table_Base] + nop.b 999 ;; +} +(p0) nop.i 999 +// +// Load Z_2 +// N = N - Bias +// Tablebase points to Table3 +// + +{ .mmi +(p0) ldfs FR_G_tmp = [GR_Index2],4 ;; +// +// Load G_2 +// pmpyshr2 X_2= (X_1,Z_2,15) +// float_N = setf.sig(N) +// ScaleN = Bias - N +// +(p0) ldfs FR_H_tmp = [GR_Index2],8 + nop.i 999 ;; +} +// +// Load H_2 +// two_negN = setf.exp(scaleN) +// G = G_1 * G_2 +// + +{ .mfi +(p0) ldfd FR_h_tmp = [GR_Index2],0 + nop.f 999 +(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;; +} + +{ .mii + nop.m 999 +(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;; +// +// Load h_2 +// H = H_1 + H_2 +// h = h_1 + h_2 +// Index3 = extr.u(X_2,1,5) +// +(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base +} + +{ .mmi + nop.m 999 + nop.m 999 +// +// float_N = fcvt.xf(float_N) +// load G3 +// +(p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;; +} + +{ .mfi +ld8 GR_Table_Base = [GR_Table_Base] +nop.f 999 +nop.i 999 +} ;; + +{ .mfi +(p0) ldfe FR_log2_hi = [GR_Table_Base],16 +(p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN + nop.i 999 ;; +} + +{ .mmf + nop.m 999 +// +// G = G3 * G +// Load h3 +// Load log2_hi +// H = H + H3 +// +(p0) ldfe FR_log2_lo = [GR_Table_Base],16 +(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;; +} + +{ .mmf +(p0) ldfs FR_G_tmp = [GR_Index3],4 +// +// h = h + h3 +// r = G * S_hi + 1 +// Load log2_lo +// +(p0) ldfe FR_Q4 = [GR_Table_Base],16 +(p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;; +} + +{ .mfi +(p0) ldfe FR_Q3 = [GR_Table_Base],16 +(p0) fadd.s1 FR_H = FR_H, FR_H_tmp + nop.i 999 ;; +} + +{ .mmf +(p0) ldfs FR_H_tmp = [GR_Index3],4 +(p0) ldfe FR_Q2 = [GR_Table_Base],16 +// +// Comput Index for Table3 +// S_lo = S_lo * two_negN +// +(p0) fcvt.xf FR_float_N = FR_float_N ;; +} +// +// If S_lo == 0, set p8 false +// Load H3 +// Load ptr to table of polynomial coeff. 
+// + +{ .mmf +(p0) ldfd FR_h_tmp = [GR_Index3],0 +(p0) ldfe FR_Q1 = [GR_Table_Base],0 +(p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;; +} + +{ .mfi + nop.m 999 +(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_H = FR_H, FR_H_tmp + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fms.s1 FR_r = FR_G, FR_S_hi, f1 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_h = FR_h, FR_h_tmp + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Load Q4 +// Load Q3 +// Load Q2 +// Load Q1 +// +(p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// poly_lo = r * Q4 + Q3 +// rsq = r* r +// +(p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// If (S_lo!=0) r = s_lo * G + r +// +(p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 + nop.i 999 +} +// +// Create a 0x00000....01 +// poly_lo = poly_lo * rsq + h +// + +{ .mfi +(p0) setf.sig FR_dummy = GR_Perturb +(p0) fmpy.s1 FR_rsq = FR_r, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// h = N * log2_lo + h +// Y_hi = n * log2_hi + H +// +(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// poly_lo = r * poly_o + Q2 +// poly_hi = Q1 * rsq + r +// +(p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo +// +// Create the FR for a binary "or" +// Y_lo = poly_hi + poly_lo +// +// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;; +// +// Turn the lsb of Y_lo ON +// +// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;; +// +// Merge the new lsb into Y_lo, for alone doesn't +// +(p0) br.cond.sptk L(LOG_main) ;; +} + + +L(log1pf_near): + +{ .mmi + nop.m 999 + nop.m 999 +// 
/*******************************************************/ +// /*********** Branch log1pf_near ************************/ +// /*******************************************************/ +(p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;; +} +// +// Load base address of poly. coeff. +// +{.mmi + nop.m 999 + ld8 GR_Table_Base = [GR_Table_Base] + nop.i 999 +};; + +{ .mmb +(p0) add GR_Table_ptr = 0x40,GR_Table_Base +// +// Address tables with separate pointers +// +(p0) ldfe FR_P8 = [GR_Table_Base],16 + nop.b 999 ;; +} + +{ .mmb +(p0) ldfe FR_P4 = [GR_Table_ptr],16 +// +// Load P4 +// Load P8 +// +(p0) ldfe FR_P7 = [GR_Table_Base],16 + nop.b 999 ;; +} + +{ .mmf +(p0) ldfe FR_P3 = [GR_Table_ptr],16 +// +// Load P3 +// Load P7 +// +(p0) ldfe FR_P6 = [GR_Table_Base],16 +(p0) fmpy.s1 FR_wsq = FR_W, FR_W ;; +} + +{ .mfi +(p0) ldfe FR_P2 = [GR_Table_ptr],16 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3 + nop.i 999 +} +// +// Load P2 +// Load P6 +// Wsq = w * w +// Y_hi = p4 * w + p3 +// + +{ .mfi +(p0) ldfe FR_P5 = [GR_Table_Base],16 +(p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7 + nop.i 999 ;; +} + +{ .mfi +(p0) ldfe FR_P1 = [GR_Table_ptr],16 +// +// Load P1 +// Load P5 +// Y_lo = p8 * w + P7 +// +(p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P6 +(p0) add GR_Perturb = 0x1, r0 ;; +} + +{ .mfi + nop.m 999 +// +// w4 = w2 * w2 +// Y_hi = y_hi * w + p2 +// Y_lo = y_lo * w + p6 +// Create perturbation bit +// +(p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P1 + nop.i 999 +} +// +// Y_hi = y_hi * w + p1 +// w6 = w4 * w2 +// + +{ .mfi +(p0) setf.sig FR_Q4 = GR_Perturb +(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_wsq,FR_Y_hi, FR_W + nop.i 999 +} + +{ .mfb + 
nop.m 999 +// +// Y_hi = y_hi * wsq + w +// Y_lo = y_lo * w + p5 +// +(p0) fmpy.s1 FR_Y_lo = FR_w6, FR_Y_lo +// +// Y_lo = y_lo * w6 +// +// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;; +// +// Set lsb on: Taken out to improve performance +// +// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;; +// +// Make sure it's on in Y_lo also. Taken out to improve +// performance +// +(p0) br.cond.sptk L(LOG_main) ;; +} + + +L(log1pf_small): + +{ .mmi + nop.m 999 + nop.m 999 +// /*******************************************************/ +// /*********** Branch log1pf_small ***********************/ +// /*******************************************************/ +(p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp +} + +{ .mfi + nop.m 999 +(p0) mov FR_Em1 = FR_W +(p0) cmp.eq.unc p7, p0 = r0, r0 ;; +} + +{ .mlx + ld8 GR_Table_Base = [GR_Table_Base] +(p0) movl GR_Expo_Range = 0x0000000000000002 ;; +} +// +// Set Safe to true +// Set Expo_Range = 0 for single +// Set Expo_Range = 2 for double +// Set Expo_Range = 4 for double-extended +// + +{ .mmi +(p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;; +(p0) ldfe FR_Threshold = [GR_Table_Base],16 + nop.i 999 +} + +{ .mlx + nop.m 999 +(p0) movl GR_Bias = 0x000000000000FF9B ;; +} + +{ .mfi +(p0) ldfe FR_Tiny = [GR_Table_Base],0 + nop.f 999 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W + nop.i 999 +} + +{ .mfi + nop.m 999 +(p13) fadd FR_SCALE = f0, f1 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p12) fsub.s1 FR_Y_lo = f0, FR_Tiny +(p12) cmp.ne.unc p7, p0 = r0, r0 +} + +{ .mfi +(p12) setf.exp FR_SCALE = GR_Bias + nop.f 999 + nop.i 999 ;; +} + +// +// Set p7 to SAFE = FALSE +// Set Scale = 2^-100 +// +{ .mfb + nop.m 999 +(p0) fma.s.s0 FR_Input_X = FR_Y_lo,FR_SCALE,FR_Y_hi +(p0) br.ret.sptk b0 +} +;; + +L(LOG_64_one): + +{ .mfb + nop.m 999 +(p0) fmpy.s.s0 FR_Input_X = FR_Input_X, f0 +(p0) br.ret.sptk b0 +} +;; +// 
+// Raise divide by zero for +/-0 input. +// + +L(LOG_64_zero): + +{ .mfi +(p0) mov GR_Parameter_TAG = 142 +// +// If we have log1pf(0), return -Inf. +// +(p0) fsub.s0 FR_Output_X_tmp = f0, f1 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 +(p0) br.cond.sptk L(LOG_ERROR_Support) ;; +} + +L(LOG_64_special): + +{ .mfi + nop.m 999 +// +// Return -Inf or value from handler. +// +(p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// Check for Natval, QNan, SNaN, +Inf +// +(p7) fmpy.s.s0 f8 = FR_Input_X, f1 +// +// For SNaN raise invalid and return QNaN. +// For QNaN raise invalid and return QNaN. +// For +Inf return +Inf. +// +(p7) br.ret.sptk b0 +} +;; + +// +// For -Inf raise invalid and return QNaN. +// + +{ .mfb +(p0) mov GR_Parameter_TAG = 143 +(p0) fmpy.s.s0 FR_Output_X_tmp = FR_Input_X, f0 +(p0) br.cond.sptk L(LOG_ERROR_Support) ;; +} + +// +// Report that log1pf(-Inf) computed +// + +L(LOG_64_unsupported): + +// +// Return generated NaN or other value . +// + +{ .mfb + nop.m 999 +(p0) fmpy.s.s0 FR_Input_X = FR_Input_X, f0 +(p0) br.ret.sptk b0 ;; +} + +L(LOG_64_negative): + +{ .mfi + nop.m 999 +// +// Deal with x < 0 in a special way +// +(p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 +// +// Deal with x < 0 in a special way - raise +// invalid and produce QNaN indefinite. 
+// +(p0) mov GR_Parameter_TAG = 143;; +} + +.endp log1pf# +ASM_SIZE_DIRECTIVE(log1pf) + +.proc __libm_error_region +__libm_error_region: +L(LOG_ERROR_Support): +.prologue + +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + + +// (2) +{ .mmi + stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfs [GR_Parameter_X] =FR_Input_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfs [GR_Parameter_Y] = FR_Output_X_tmp // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfs FR_Input_X = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + +.proc __libm_LOG_main +__libm_LOG_main: +L(LOG_main): + +// +// kernel_log_64 computes ln(X + E) +// + +{ .mfi + nop.m 999 +(p7) fadd.s.s0 FR_Input_X = FR_Y_lo,FR_Y_hi + nop.i 999 +} + +{ .mmi + nop.m 999 + nop.m 999 +(p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;; +} + +{ .mmi + nop.m 999 +(p14) ld8 GR_Table_Base = [GR_Table_Base] + nop.i 999 +};; + +{ .mmi +(p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;; +(p14) ldfe FR_1LN10_lo = [GR_Table_Base] + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi + 
nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p14) fma.s.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp +(p0) br.ret.sptk b0 ;; +} +.endp __libm_LOG_main +ASM_SIZE_DIRECTIVE(__libm_LOG_main) + + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_log1pl.S b/sysdeps/ia64/fpu/s_log1pl.S new file mode 100644 index 0000000000..54ef807fd1 --- /dev/null +++ b/sysdeps/ia64/fpu/s_log1pl.S @@ -0,0 +1,1663 @@ +.file "log1pl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// ********************************************************************* +// +// History: +// 2/02/00 hand-optimized +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// +// ********************************************************************* +// +// ********************************************************************* +// +// Function: Combined logl(x), log1pl(x), and log10l(x) where +// logl(x) = ln(x), for double-extended precision x values +// log1pl(x) = ln(x+1), for double-extended precision x values +// log10l(x) = log (x), for double-extended precision x values +// 10 +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9,f33-f55,f99 +// +// General Purpose Registers: +// r32-r53 +// r54-r57 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions cannot occur +// Underflow exceptions raised when appropriate for log1p +// (Error Handling Routine called for underflow) +// Inexact raised when appropriate by algorithm +// +// logl(inf) = inf +// logl(-inf) = QNaN +// logl(+/-0) = -inf +// logl(SNaN) = QNaN +// logl(QNaN) = QNaN +// logl(EM_special Values) = QNaN +// log1pl(inf) = inf +// log1pl(-inf) = QNaN +// log1pl(+/-0) = +/-0 +// log1pl(-1) = -inf +// log1pl(SNaN) = QNaN +// log1pl(QNaN) = QNaN +// log1pl(EM_special Values) = QNaN +// log10l(inf) = inf +// log10l(-inf) = QNaN +// log10l(+/-0) = -inf +// log10l(SNaN) = QNaN +// log10l(QNaN) = QNaN +// log10l(EM_special Values) = QNaN +// +// ********************************************************************* +// +// Computation is based on the following kernel. 
+// +// ker_log_64( in_FR : X, +// in_FR : E, +// in_FR : Em1, +// in_GR : Expo_Range, +// out_FR : Y_hi, +// out_FR : Y_lo, +// out_FR : Scale, +// out_PR : Safe ) +// +// Overview +// +// The method consists of three cases. +// +// If |X+Em1| < 2^(-80) use case log1pl_small; +// elseif |X+Em1| < 2^(-7) use case log_near1; +// else use case log_regular; +// +// Case log1pl_small: +// +// logl( 1 + (X+Em1) ) can be approximated by (X+Em1). +// +// Case log_near1: +// +// logl( 1 + (X+Em1) ) can be approximated by a simple polynomial +// in W = X+Em1. This polynomial resembles the truncated Taylor +// series W - W^2/2 + W^3/3 - ... +// +// Case log_regular: +// +// Here we use a table lookup method. The basic idea is that in +// order to compute logl(Arg) for an argument Arg in [1,2), we +// construct a value G such that G*Arg is close to 1 and that +// logl(1/G) is obtainable easily from a table of values calculated +// beforehand. Thus +// +// logl(Arg) = logl(1/G) + logl(G*Arg) +// = logl(1/G) + logl(1 + (G*Arg - 1)) +// +// Because |G*Arg - 1| is small, the second term on the right hand +// side can be approximated by a short polynomial. We elaborate +// this method in four steps. +// +// Step 0: Initialization +// +// We need to calculate logl( E + X ). Obtain N, S_hi, S_lo such that +// +// E + X = 2^N * ( S_hi + S_lo ) exactly +// +// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense +// that |S_lo| <= ulp(S_hi). +// +// Step 1: Argument Reduction +// +// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate +// +// G := G_1 * G_2 * G_3 +// r := (G * S_hi - 1) + G * S_lo +// +// These G_j's have the property that the product is exactly +// representable and that |r| < 2^(-12) as a result. +// +// Step 2: Approximation +// +// +// logl(1 + r) is approximated by a short polynomial poly(r).
+// +// Step 3: Reconstruction +// +// +// Finally, logl( E + X ) is given by +// +// logl( E + X ) = logl( 2^N * (S_hi + S_lo) ) +// ~=~ N*logl(2) + logl(1/G) + logl(1 + r) +// ~=~ N*logl(2) + logl(1/G) + poly(r). +// +// **** Algorithm **** +// +// Case log1pl_small: +// +// Although logl(1 + (X+Em1)) is basically X+Em1, we would like to +// preserve its inexact nature as well as consistent behavior +// under different rounding modes. Note that this case can only be +// taken if E is set to be 1.0. In this case, Em1 is zero, and +// X can be very tiny and thus the final result can possibly underflow. +// Thus, we compare X against a threshold that is dependent on the +// input Expo_Range. If |X| is smaller than this threshold, we set +// SAFE to be FALSE. +// +// The result is returned as Y_hi, Y_lo, and in the case that SAFE +// is FALSE, an additional value Scale is also returned. +// +// W := X + Em1 +// Threshold := Threshold_Table( Expo_Range ) +// Tiny := Tiny_Table( Expo_Range ) +// +// If ( |W| > Threshold ) then +// Y_hi := W +// Y_lo := -W*W +// Else +// Y_hi := W +// Y_lo := -Tiny +// Scale := 2^(-100) +// Safe := FALSE +// EndIf +// +// +// One may think that Y_lo should be -W*W/2; however, it does not matter +// as Y_lo will be rounded off completely except for the correct effect in +// directed rounding. Clearly -W*W is simpler to compute. Moreover, +// because of the difference in exponent value, Y_hi + Y_lo or +// Y_hi + Scale*Y_lo is always inexact. +// +// Case log_near1: +// +// Here we compute a simple polynomial. To exploit parallelism, we split +// the polynomial into two portions. +// +// W := X + Em1 +// Wsq := W * W +// W4 := Wsq*Wsq +// W6 := W4*Wsq +// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4))) +// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8))) +// set lsb(Y_lo) to be 1 +// +// Case log_regular: +// +// We present the algorithm in four steps. +// +// Step 0.
Initialization +// ---------------------- +// +// Z := X + E +// N := unbiased exponent of Z +// S_hi := 2^(-N) * Z +// S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) } +// +// Note that S_lo is always 0 for the case E = 0. +// +// Step 1. Argument Reduction +// -------------------------- +// +// Let +// +// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63 +// +// We obtain G_1, G_2, G_3 by the following steps. +// +// +// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted +// from S_hi. +// +// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated +// to lsb = 2^(-4). +// +// Define index_1 := [ d_1 d_2 d_3 d_4 ]. +// +// Fetch Z_1 := (1/A_1) rounded UP in fixed point with +// fixed point lsb = 2^(-15). +// Z_1 looks like z_0.z_1 z_2 ... z_15 +// Note that the fetching is done using index_1. +// A_1 is actually not needed in the implementation +// and is used here only to explain how the value +// Z_1 is defined. +// +// Fetch G_1 := (1/A_1) truncated to 21 sig. bits. +// floating pt. Again, fetching is done using index_1. A_1 +// explains how G_1 is defined. +// +// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) +// = 1.0 0 0 0 d_5 ... d_14 +// This is accomplished by integer multiplication. +// It is proved that X_1 indeed always begins +// with 1.0000 in fixed point. +// +// +// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 +// truncated to lsb = 2^(-8). Similar to A_1, +// A_2 is not needed in actual implementation. It +// helps explain how some of the values are defined. +// +// Define index_2 := [ d_5 d_6 d_7 d_8 ]. +// +// Fetch Z_2 := (1/A_2) rounded UP in fixed point with +// fixed point lsb = 2^(-15). Fetch done using index_2. +// Z_2 looks like z_0.z_1 z_2 ... z_15 +// +// Fetch G_2 := (1/A_2) truncated to 21 sig. bits. +// floating pt. +// +// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) +// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 +// This is accomplished by integer multiplication.
+// It is proved that X_2 indeed always begins +// with 1.00000000 in fixed point. +// +// +// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. +// This is 2^(-14) + X_2 truncated to lsb = 2^(-13). +// +// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. +// +// Fetch G_3 := (1/A_3) truncated to 21 sig. bits. +// floating pt. Fetch is done using index_3. +// +// Compute G := G_1 * G_2 * G_3. +// +// This is done exactly since each G_j has only 21 sig. bits. +// +// Compute +// +// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations. +// +// Thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of +// rounding errors. +// +// +// Step 2. Approximation +// --------------------- +// +// This step computes an approximation to logl( 1 + r ) where r is the +// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13); +// thus logl(1+r) can be approximated by a short polynomial: +// +// logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 +// +// +// Step 3. Reconstruction +// ---------------------- +// +// This step computes the desired result of logl(X+E): +// +// logl(X+E) = logl( 2^N * (S_hi + S_lo) ) +// = N*logl(2) + logl( S_hi + S_lo ) +// = N*logl(2) + logl(1/G) + +// logl(1 + G*(S_hi+S_lo) - 1 ) +// +// logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers: +// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are +// single-precision numbers and the low parts are double precision +// numbers. These have the property that +// +// N*log2_hi + SUM ( log1byGj_hi ) +// +// is computable exactly in double-extended precision (64 sig. bits).
+// Finally +// +// Y_hi := N*log2_hi + SUM ( log1byGj_hi ) +// Y_lo := poly_hi + [ poly_lo + +// ( SUM ( log1byGj_lo ) + N*log2_lo ) ] +// set lsb(Y_lo) to be 1 +// + +#include "libm_support.h" + +#ifdef _LIBC +.rodata +#else +.data +#endif + +// P_7, P_6, P_5, P_4, P_3, P_2, and P_1 + +.align 64 +Constants_P: +ASM_TYPE_DIRECTIVE(Constants_P,@object) +data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000 +data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000 +data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000 +data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000 +data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000 +data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000 +data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000 +data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_P) + +// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 + +.align 64 +Constants_Q: +ASM_TYPE_DIRECTIVE(Constants_Q,@object) +data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 +data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 +data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 +data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 +data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 +data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_Q) + +// Z1 - 16 bit fixed, G1 and H1 - IEEE single + +.align 64 +Constants_Z_G_H_h1: +ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object) +data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6 +data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6 +data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF +data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C +data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C +data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F +data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B +data4 
0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34 +data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E +data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C +data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3 +data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2 +data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895 +data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5 +data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874 +ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1) + +// Z2 - 16 bit fixed, G2 and H2 - IEEE single + +.align 64 +Constants_Z_G_H_h2: +ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object) +data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116 +data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF +data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E +data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0 +data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F +data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791 +data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C +data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156 +data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97 +data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483 +data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9 +data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06 +data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202 +data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4 +data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391 +ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2) + +// G3 and H3 - IEEE single and h3 -IEEE 
double + +.align 64 +Constants_Z_G_H_h3: +ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object) +data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595 +data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2 +data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D +data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291 +data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8 +data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707 +data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9 +data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47 +data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E +data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D +data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441 +data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95 +data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC +data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337 +data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B +data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B +data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21 +data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4 +data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070 +data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC +data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83 +data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40 +data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7 +data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B +data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E +data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06 +data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1 +data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103 +data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B +data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19 +data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502 +data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17 +ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3) + +// +// Exponent Thresholds and Tiny Thresholds +// for 8, 11, 15, and 17 bit exponents +// +// Expo_Range Value +// +// 0 (8 bits) 2^(-126) +// 1 (11 bits) 2^(-1022) +// 2 (15 bits) 2^(-16382) +// 3 (17 bits) 2^(-16382) +// +// 
Tiny_Table +// ---------- +// Expo_Range Value +// +// 0 (8 bits) 2^(-16382) +// 1 (11 bits) 2^(-16382) +// 2 (15 bits) 2^(-16382) +// 3 (17 bits) 2^(-16382) +// + +.align 64 +Constants_Threshold: +ASM_TYPE_DIRECTIVE(Constants_Threshold,@object) +data4 0x00000000,0x80000000,0x00003F81,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00003C01,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +data4 0x00000000,0x80000000,0x00000001,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_Threshold) + +.align 64 +Constants_1_by_LN10: +ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object) +data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000 +data4 0xACCF70C8,0xD56EAABE,0x00003FBB,0x00000000 +ASM_SIZE_DIRECTIVE(Constants_1_by_LN10) + +FR_Input_X = f8 +FR_Neg_One = f9 +FR_E = f33 +FR_Em1 = f34 +FR_Y_hi = f34 +// Shared with Em1 +FR_Y_lo = f35 +FR_Scale = f36 +FR_X_Prime = f37 +FR_Z = f38 +FR_S_hi = f38 +// Shared with Z +FR_W = f39 +FR_G = f40 +FR_wsq = f40 +// Shared with G +FR_H = f41 +FR_w4 = f41 +// Shared with H +FR_h = f42 +FR_w6 = f42 +// Shared with h +FR_G_tmp = f43 +FR_poly_lo = f43 +// Shared with G_tmp +FR_P8 = f43 +// Shared with G_tmp +FR_H_tmp = f44 +FR_poly_hi = f44 + // Shared with H_tmp +FR_P7 = f44 +// Shared with H_tmp +FR_h_tmp = f45 +FR_rsq = f45 +// Shared with h_tmp +FR_P6 = f45 +// Shared with h_tmp +FR_abs_W = f46 +FR_r = f46 +// Shared with abs_W +FR_AA = f47 +FR_log2_hi = f47 +// Shared with AA +FR_BB = f48 +FR_log2_lo = f48 +// Shared with BB +FR_S_lo = f49 +FR_two_negN = f50 +FR_float_N = f51 +FR_Q4 = f52 +FR_dummy = f52 +// Shared with Q4 +FR_P4 = f52 +// Shared with Q4 +FR_Threshold = f52 +// Shared with Q4 +FR_Q3 = f53 +FR_P3 = f53 +// Shared with Q3 +FR_Tiny = f53 +// Shared with Q3 +FR_Q2 = f54 +FR_P2 = f54 +// Shared with Q2 +FR_1LN10_hi = f54 +// Shared 
with Q2 +FR_Q1 = f55 +FR_P1 = f55 +// Shared with Q1 +FR_1LN10_lo = f55 +// Shared with Q1 +FR_P5 = f98 +FR_SCALE = f98 +FR_Output_X_tmp = f99 + +GR_Expo_Range = r32 +GR_Table_Base = r34 +GR_Table_Base1 = r35 +GR_Table_ptr = r36 +GR_Index2 = r37 +GR_signif = r38 +GR_X_0 = r39 +GR_X_1 = r40 +GR_X_2 = r41 +GR_Z_1 = r42 +GR_Z_2 = r43 +GR_N = r44 +GR_Bias = r45 +GR_M = r46 +GR_ScaleN = r47 +GR_Index3 = r48 +GR_Perturb = r49 +GR_Table_Scale = r50 + +// +// Added for unwind support +// + +GR_SAVE_PFS = r51 +GR_SAVE_B0 = r52 +GR_SAVE_GP = r53 +GR_Parameter_X = r54 +GR_Parameter_Y = r55 +GR_Parameter_RESULT = r56 +GR_Parameter_TAG = r57 + +FR_X = f8 +FR_Y = f0 +FR_RESULT = f99 + +.section .text +.proc logl# +.global logl# +.align 64 +logl: +#ifdef _LIBC +.global __ieee754_logl +__ieee754_logl: +#endif +{ .mfi +alloc r32 = ar.pfs,0,22,4,0 +(p0) fnorm.s1 FR_X_Prime = FR_Input_X +(p0) cmp.eq.unc p7, p0 = r0, r0 +} +{ .mfi +(p0) cmp.ne.unc p14, p0 = r0, r0 +(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 +(p0) cmp.ne.unc p15, p0 = r0, r0 ;; +} +{ .mfi + nop.m 0 +(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF + nop.i 0 +} +{ .mfi +nop.m 999 +(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, f0 + nop.i 0 +} +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, f0 + nop.i 0 +} +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fsub.s1 FR_Em1 = f0,f1 + nop.i 999 +} +{ .mfb + nop.m 999 +(p0) fadd FR_E = f0,f0 +// +// Create E = 0 and Em1 = -1 +// Check for X == 1, meaning logl(1) +// Check for X < 0, meaning logl(negative) +// Check for X == 0, meaning logl(0) +// Identify NatVals, NaNs, Infs. +// Identify EM unsupporteds. 
+// Identify Negative values - use S1 so as +// not to raise denormal operand exception +// Set p15 to false for log +// Set p14 to false for log +// Set p7 true for log and log1p +// +(p0) br.cond.sptk L(LOGL_BEGIN) ;; +} + +.endp logl +ASM_SIZE_DIRECTIVE(logl) + +.section .text +.proc log10l# +.global log10l# +.align 64 +log10l: +#ifdef _LIBC +.global __ieee754_log10l +__ieee754_log10l: +#endif +{ .mfi +alloc r32 = ar.pfs,0,22,4,0 +(p0) fadd FR_E = f0,f0 + nop.i 0 +} +{ .mfi + nop.m 0 +(p0) fsub.s1 FR_Em1 = f0,f1 + nop.i 0 +} +{ .mfi +(p0) cmp.ne.unc p15, p0 = r0, r0 +(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f1 + nop.i 0 +} +{ .mfi +(p0) cmp.eq.unc p14, p0 = r0, r0 +(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, f0 +(p0) cmp.ne.unc p7, p0 = r0, r0 ;; +} +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 + nop.i 999 +} +{ .mfb + nop.m 999 +(p0) fnorm.s1 FR_X_Prime = FR_Input_X +// +// Create E = 0 and Em1 = -1 +// Check for X == 1, meaning log10l(1) +// Check for X < 0, meaning log10l(negative) +// Check for X == 0, meaning log10l(0) +// Identify NatVals, NaNs, Infs. +// Identify EM unsupporteds. 
+// Normalize x +// Identify Negative values - use S1 so as +// not to raise denormal operand exception +// Set p15 to false for log10 +// Set p14 to true for log10 +// Set p7 to false for log10 +// +(p0) br.cond.sptk L(LOGL_BEGIN) ;; +} + +.endp log10l +ASM_SIZE_DIRECTIVE(log10l) + +.section .text +.proc log1pl# +.global log1pl# +.align 64 +log1pl: +#ifdef _LIBC +.global __log1pl +__log1pl: +#endif +{ .mfi +alloc r32 = ar.pfs,0,22,4,0 +(p0) fsub.s1 FR_Neg_One = f0,f1 +(p0) cmp.eq.unc p7, p0 = r0, r0 +} +{ .mfi +(p0) cmp.ne.unc p14, p0 = r0, r0 +(p0) fnorm.s1 FR_X_Prime = FR_Input_X +(p0) cmp.eq.unc p15, p0 = r0, r0 ;; +} +{ .mfi + nop.m 0 +(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 + nop.i 0 +} +{ .mfi + nop.m 999 +(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF + nop.i 0 +} +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0 + nop.i 0 +} +{ .mfi + nop.m 999 +(p0) fadd FR_Em1 = f0,f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fadd FR_E = f0,f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One + nop.i 999 +} +L(LOGL_BEGIN): +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_Z = FR_X_Prime, FR_E + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl GR_Table_Scale = 0x0000000000000018 ;; +} +{ .mmi + nop.m 999 + nop.m 999 +// +// Create E = 1 and Em1 = 0 +// Check for X == 0, meaning logl(1+0) +// Check for X < -1, meaning logl(negative) +// Check for X == -1, meaning logl(0) +// Normalize x +// Identify NatVals, NaNs, Infs. +// Identify EM unsupporteds. 
+// Identify Negative values - us S1 so as +// not to raise denormal operand exception +// Set p15 to true for log1p +// Set p14 to false for log1p +// Set p7 true for log and log1p +// +(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp +} +{ .mfi + nop.m 999 +(p0) fmax.s1 FR_AA = FR_X_Prime, FR_E + nop.i 999 ;; +} +{ .mfi + ld8 GR_Table_Base = [GR_Table_Base] +(p0) fmin.s1 FR_BB = FR_X_Prime, FR_E + nop.i 999 +} +{ .mfb + nop.m 999 +(p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1 +// +// Begin load of constants base +// FR_Z = Z = |x| + E +// FR_W = W = |x| + Em1 +// AA = fmax(|x|,E) +// BB = fmin(|x|,E) +// +(p6) br.cond.spnt L(LOGL_64_special) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.spnt L(LOGL_64_unsupported) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p13) br.cond.spnt L(LOGL_64_negative) ;; +} +{ .mib +(p0) getf.sig GR_signif = FR_Z + nop.i 999 +(p9) br.cond.spnt L(LOGL_64_one) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p8) br.cond.spnt L(LOGL_64_zero) ;; +} +{ .mfi +(p0) getf.exp GR_N = FR_Z +// +// Raise possible denormal operand exception +// Create Bias +// +// This function computes ln( x + e ) +// Input FR 1: FR_X = FR_Input_X +// Input FR 2: FR_E = FR_E +// Input FR 3: FR_Em1 = FR_Em1 +// Input GR 1: GR_Expo_Range = GR_Expo_Range = 1 +// Output FR 4: FR_Y_hi +// Output FR 5: FR_Y_lo +// Output FR 6: FR_Scale +// Output PR 7: PR_Safe +// +(p0) fsub.s1 FR_S_lo = FR_AA, FR_Z +// +// signif = getf.sig(Z) +// abs_W = fabs(w) +// +(p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;; +} +{ .mfi + nop.m 999 +(p0) fmerge.se FR_S_hi = f1,FR_Z +(p0) extr.u GR_X_0 = GR_signif, 49, 15 +} +{ .mmi + nop.m 999 + nop.m 999 +(p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp ;; +} +{ .mlx + ld8 GR_Table_Base1 = [GR_Table_Base1] +(p0) movl GR_Bias = 0x000000000000FFFF ;; +} +{ .mfi + nop.m 999 +(p0) fabs FR_abs_W = FR_W +(p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0 +} +{ .mfi + nop.m 999 +// +// Branch out for special input values +// +(p0) 
fcmp.lt.unc.s0 p8, p0 = FR_Input_X, f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// X_0 = extr.u(signif,49,15) +// Index1 = extr.u(signif,59,4) +// +(p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +// +// Offset_to_Z1 = 24 * Index1 +// For performance, don't use result +// for 3 or 4 cycles. +// +(p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;; +} +// +// Add Base to Offset for Z1 +// Create Bias +{ .mmi +(p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;; +(p0) ldfs FR_G = [GR_Table_ptr],4 + nop.i 999 ;; +} +{ .mmi +(p0) ldfs FR_H = [GR_Table_ptr],8 ;; +(p0) ldfd FR_h = [GR_Table_ptr],0 +(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 +} +// +// Load Z_1 +// Get Base of Table2 +// +{ .mfi +(p0) getf.exp GR_M = FR_abs_W + nop.f 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +// +// M = getf.exp(abs_W) +// S_lo = AA - Z +// X_1 = pmpyshr2(X_0,Z_1,15) +// +(p0) sub GR_M = GR_M, GR_Bias ;; +} +// +// M = M - Bias +// Load G1 +// N = getf.exp(Z) +// +{ .mii +(p0) cmp.gt.unc p11, p0 = -80, GR_M +(p0) cmp.gt.unc p12, p0 = -7, GR_M ;; +(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;; +} +{ .mib + nop.m 999 +// +// if -80 > M, set p11 +// Index2 = extr.u(X_1,6,4) +// if -7 > M, set p12 +// Load H1 +// +(p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0 +(p11) br.cond.spnt L(log1pl_small) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p12) br.cond.spnt L(log1pl_near) ;; +} +{ .mii +(p0) sub GR_N = GR_N, GR_Bias +// +// poly_lo = r * poly_lo +// +(p0) add GR_Perturb = 0x1, r0 ;; +(p0) sub GR_ScaleN = GR_Bias, GR_N +} +{ .mii +(p0) setf.sig FR_float_N = GR_N + nop.i 999 ;; +// +// Prepare Index2 - pmpyshr2.u(X_1,Z_2,15) +// Load h1 +// S_lo = S_lo + BB +// Branch for -80 > M +// +(p0) add GR_Index2 = GR_Index2, GR_Table_Base1 +} +{ .mmi +(p0) setf.exp FR_two_negN = GR_ScaleN + nop.m 999 +(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp ;; +} +// +// Index2 points to Z2 +// Branch for -7 > M +// +{ .mmb +(p0) ld4 GR_Z_2 = [GR_Index2],4 +(p0) 
ld8 GR_Table_Base = [GR_Table_Base] + nop.b 999 ;; +} +(p0) nop.i 999 +// +// Load Z_2 +// N = N - Bias +// Tablebase points to Table3 +// +{ .mmi +(p0) ldfs FR_G_tmp = [GR_Index2],4 ;; +// +// Load G_2 +// pmpyshr2 X_2= (X_1,Z_2,15) +// float_N = setf.sig(N) +// ScaleN = Bias - N +// +(p0) ldfs FR_H_tmp = [GR_Index2],8 + nop.i 999 ;; +} +// +// Load H_2 +// two_negN = setf.exp(scaleN) +// G = G_1 * G_2 +// +{ .mfi +(p0) ldfd FR_h_tmp = [GR_Index2],0 + nop.f 999 +(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;; +} +{ .mii + nop.m 999 +(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;; +// +// Load h_2 +// H = H_1 + H_2 +// h = h_1 + h_2 +// Index3 = extr.u(X_2,1,5) +// +(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base +} +{ .mmi + nop.m 999 + nop.m 999 +// +// float_N = fcvt.xf(float_N) +// load G3 +// +(p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;; +} +{ .mmi + nop.m 999 + ld8 GR_Table_Base = [GR_Table_Base] + nop.i 999 +};; + +{ .mfi +(p0) ldfe FR_log2_hi = [GR_Table_Base],16 +(p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN + nop.i 999 ;; +} +{ .mmf + nop.m 999 +// +// G = G3 * G +// Load h3 +// Load log2_hi +// H = H + H3 +// +(p0) ldfe FR_log2_lo = [GR_Table_Base],16 +(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;; +} +{ .mmf +(p0) ldfs FR_G_tmp = [GR_Index3],4 +// +// h = h + h3 +// r = G * S_hi + 1 +// Load log2_lo +// +(p0) ldfe FR_Q4 = [GR_Table_Base],16 +(p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;; +} +{ .mfi +(p0) ldfe FR_Q3 = [GR_Table_Base],16 +(p0) fadd.s1 FR_H = FR_H, FR_H_tmp + nop.i 999 ;; +} +{ .mmf +(p0) ldfs FR_H_tmp = [GR_Index3],4 +(p0) ldfe FR_Q2 = [GR_Table_Base],16 +// +// Comput Index for Table3 +// S_lo = S_lo * two_negN +// +(p0) fcvt.xf FR_float_N = FR_float_N ;; +} +// +// If S_lo == 0, set p8 false +// Load H3 +// Load ptr to table of polynomial coeff. 
+// +{ .mmf +(p0) ldfd FR_h_tmp = [GR_Index3],0 +(p0) ldfe FR_Q1 = [GR_Table_Base],0 +(p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_H = FR_H, FR_H_tmp + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fms.s1 FR_r = FR_G, FR_S_hi, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fadd.s1 FR_h = FR_h, FR_h_tmp + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Load Q4 +// Load Q3 +// Load Q2 +// Load Q1 +// +(p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r + nop.i 999 +} +{ .mfi + nop.m 999 +// +// poly_lo = r * Q4 + Q3 +// rsq = r* r +// +(p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// If (S_lo!=0) r = s_lo * G + r +// +(p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 + nop.i 999 +} +// +// Create a 0x00000....01 +// poly_lo = poly_lo * rsq + h +// +{ .mfi +(p0) setf.sig FR_dummy = GR_Perturb +(p0) fmpy.s1 FR_rsq = FR_r, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// h = N * log2_lo + h +// Y_hi = n * log2_hi + H +// +(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// poly_lo = r * poly_o + Q2 +// poly_hi = Q1 * rsq + r +// +(p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo +// +// Create the FR for a binary "or" +// Y_lo = poly_hi + poly_lo +// +// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;; +// +// Turn the lsb of Y_lo ON +// +// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;; +// +// Merge the new lsb into Y_lo, for alone doesn't +// +(p0) br.cond.sptk LOGL_main ;; +} +L(log1pl_near): +{ .mmi + nop.m 999 + nop.m 999 +// 
/*******************************************************/ +// /*********** Branch log1pl_near ************************/ +// /*******************************************************/ +(p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;; +} +{ .mmi + nop.m 999 + ld8 GR_Table_Base = [GR_Table_Base] + nop.i 999 +};; +// +// Load base address of poly. coeff. +// +{ .mmb +(p0) add GR_Table_ptr = 0x40,GR_Table_Base +// +// Address tables with separate pointers +// +(p0) ldfe FR_P8 = [GR_Table_Base],16 + nop.b 999 ;; +} +{ .mmb +(p0) ldfe FR_P4 = [GR_Table_ptr],16 +// +// Load P4 +// Load P8 +// +(p0) ldfe FR_P7 = [GR_Table_Base],16 + nop.b 999 ;; +} +{ .mmf +(p0) ldfe FR_P3 = [GR_Table_ptr],16 +// +// Load P3 +// Load P7 +// +(p0) ldfe FR_P6 = [GR_Table_Base],16 +(p0) fmpy.s1 FR_wsq = FR_W, FR_W ;; +} +{ .mfi +(p0) ldfe FR_P2 = [GR_Table_ptr],16 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3 + nop.i 999 +} +// +// Load P2 +// Load P6 +// Wsq = w * w +// Y_hi = p4 * w + p3 +// +{ .mfi +(p0) ldfe FR_P5 = [GR_Table_Base],16 +(p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7 + nop.i 999 ;; +} +{ .mfi +(p0) ldfe FR_P1 = [GR_Table_ptr],16 +// +// Load P1 +// Load P5 +// Y_lo = p8 * w + P7 +// +(p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P6 +(p0) add GR_Perturb = 0x1, r0 ;; +} +{ .mfi + nop.m 999 +// +// w4 = w2 * w2 +// Y_hi = y_hi * w + p2 +// Y_lo = y_lo * w + p6 +// Create perturbation bit +// +(p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P1 + nop.i 999 +} +// +// Y_hi = y_hi * w + p1 +// w6 = w4 * w2 +// +{ .mfi +(p0) setf.sig FR_Q4 = GR_Perturb +(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_dummy = FR_wsq,FR_Y_hi, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fma.s1 FR_Y_hi 
= FR_W,f1,f0 + nop.i 999 +};; +{ .mfb + nop.m 999 +// +// Y_hi = w +// Y_lo = y_lo * w + p5 +// +(p0) fma.s1 FR_Y_lo = FR_w6, FR_Y_lo,FR_dummy +// +// Y_lo = y_lo * w6 + y_high order part. +// +// performance +// +(p0) br.cond.sptk LOGL_main ;; +} +L(log1pl_small): +{ .mmi + nop.m 999 +// /*******************************************************/ +// /*********** Branch log1pl_small ***********************/ +// /*******************************************************/ +(p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp +} +{ .mfi + nop.m 999 +(p0) mov FR_Em1 = FR_W +(p0) cmp.eq.unc p7, p0 = r0, r0 ;; +} +{ .mlx + ld8 GR_Table_Base = [GR_Table_Base] +(p0) movl GR_Expo_Range = 0x0000000000000004 ;; +} +// +// Set Safe to true +// Set Expo_Range = 0 for single +// Set Expo_Range = 2 for double +// Set Expo_Range = 4 for double-extended +// +{ .mmi +(p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;; +(p0) ldfe FR_Threshold = [GR_Table_Base],16 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl GR_Bias = 0x000000000000FF9B ;; +} +{ .mfi +(p0) ldfe FR_Tiny = [GR_Table_Base],0 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W + nop.i 999 +} +{ .mfi + nop.m 999 +(p13) fadd FR_SCALE = f0, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fsub.s1 FR_Y_lo = f0, FR_Tiny +(p12) cmp.ne.unc p7, p0 = r0, r0 +} +{ .mfi +(p12) setf.exp FR_SCALE = GR_Bias + nop.f 999 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Set p7 to SAFE = FALSE +// Set Scale = 2^-100 +// +(p0) fma.s0 f8 = FR_Y_lo,FR_SCALE,FR_Y_hi +(p0) br.ret.sptk b0 ;; +} +L(LOGL_64_one): +{ .mfb + nop.m 999 +(p0) fmpy.s0 f8 = FR_Input_X, f0 +(p0) br.ret.sptk b0 ;; +} +// +// Raise divide by zero for +/-0 input. +// +L(LOGL_64_zero): +{ .mfi +(p0) mov GR_Parameter_TAG = 0 +// +// If we have logl(1), log10l(1) or log1pl(0), return 0. 
+// +(p0) fsub.s0 FR_Output_X_tmp = f0, f1 + nop.i 999 ;; +} +{ .mii +(p14) mov GR_Parameter_TAG = 6 + nop.i 999 ;; +(p15) mov GR_Parameter_TAG = 138 ;; +} +{ .mfb + nop.m 999 +(p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 +(p0) br.cond.sptk __libm_error_region ;; +} +{ .mfb + nop.m 999 +// +// Report that logl(0) computed +// { .mfb +(p0) mov FR_Input_X = FR_Output_X_tmp +(p0) br.ret.sptk b0 ;; +} + +L(LOGL_64_special): +{ .mfi + nop.m 999 +// +// Return -Inf or value from handler. +// +(p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Check for Natval, QNan, SNaN, +Inf +// +(p7) fmpy.s0 f8 = FR_Input_X, f1 +// +// For SNaN raise invalid and return QNaN. +// For QNaN raise invalid and return QNaN. +// For +Inf return +Inf. +// +(p7) br.ret.sptk b0 ;; +} +// +// For -Inf raise invalid and return QNaN. +// +{ .mii +(p0) mov GR_Parameter_TAG = 1 + nop.i 999 ;; +(p14) mov GR_Parameter_TAG = 7 ;; +} +{ .mfi +(p15) mov GR_Parameter_TAG = 139 + nop.f 999 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p0) fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0 +(p0) br.cond.sptk __libm_error_region ;; +} +// +// Report that logl(-Inf) computed +// Report that log10l(-Inf) computed +// Report that log1p(-Inf) computed +// +{ .mfb + nop.m 0 +(p0) mov FR_Input_X = FR_Output_X_tmp +(p0) br.ret.sptk b0 ;; +} +L(LOGL_64_unsupported): +{ .mfb + nop.m 999 +// +// Return generated NaN or other value . +// +(p0) fmpy.s0 f8 = FR_Input_X, f0 +(p0) br.ret.sptk b0 ;; +} +L(LOGL_64_negative): +{ .mfi + nop.m 999 +// +// Deal with x < 0 in a special way +// +(p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 +// +// Deal with x < 0 in a special way - raise +// invalid and produce QNaN indefinite. 
+// +(p0) mov GR_Parameter_TAG = 1 ;; +} +{ .mii +(p14) mov GR_Parameter_TAG = 7 + nop.i 999 ;; +(p15) mov GR_Parameter_TAG = 139 +} +.endp log1pl +ASM_SIZE_DIRECTIVE(log1pl) + +.proc __libm_error_region +__libm_error_region: +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; +{ .mmi + stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; +.body +{ .mib + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address +} +{ .mib + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.proc LOGL_main +LOGL_main: +{ .mfi + nop.m 999 +// +// kernel_log_64 computes ln(X + E) +// +(p7) fadd.s0 FR_Input_X = FR_Y_lo,FR_Y_hi + nop.i 0 +} +{ .mmi + nop.m 999 + nop.m 999 +(p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;; +} +{ .mmi + nop.m 999 +(p14) ld8 GR_Table_Base = [GR_Table_Base] + nop.i 999 +};; + +{ .mmi +(p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;; +(p14) ldfe FR_1LN10_lo = [GR_Table_Base] + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi + nop.i 999 ;; +} +{ .mfi + nop.m 
999 +(p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p14) fma.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp +(p0) br.ret.sptk b0 ;; +} +.endp LOGL_main +ASM_SIZE_DIRECTIVE(LOGL_main) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_logb.S b/sysdeps/ia64/fpu/s_logb.S new file mode 100644 index 0000000000..d24f1f6497 --- /dev/null +++ b/sysdeps/ia64/fpu/s_logb.S @@ -0,0 +1,314 @@ +.file "logb.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+//
+// History
+//==============================================================
+// 2/02/00  Initial version
+// 2/16/00  Modified to conform to C9X
+// 3/16/00  Improved speed
+// 4/04/00  Unwind support added
+// 5/30/00  Fixed bug when x double-extended denormal
+// 8/15/00  Bundle added after call to __libm_error_support to properly
+//          set [the previously overwritten] GR_Parameter_RESULT.
+//
+// API
+//==============================================================
+// double logb( double x);
+//
+// Overview of operation
+//==============================================================
+// The logb function extracts the exponent of x as an integer in
+// floating-point format.
+// logb returns the unbiased binary exponent of x, converted to a double
+//
+// logb is similar to ilogb but differs in the following ways:
+//         +-inf
+//            ilogb: returns INT_MAX
+//             logb: returns +inf
+//         NaN  returns FP_LOGBNAN (which is either INT_MAX or INT_MIN)
+//            ilogb: returns INT_MAX (7fffffff)
+//             logb: returns QNAN (quietized SNAN)
+//         0    returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX)
+//            ilogb: returns -INT_MAX (80000001)
+//             logb: returns -inf, raises the divide-by-zero exception,
+//                   and calls libm_error_support to set domain error
+//
+// Registers used
+//==============================================================
+// general registers used:
+// ar.pfs r32
+// r33 -> r37
+// r38 -> r41 used as parameters to error path
+//
+// predicate registers used:
+// p6, p7, p8
+// floating-point registers used:
+// f9, f10, f11
+// f8, input
+
+#include "libm_support.h"
+
+GR_SAVE_B0 = r34
+GR_SAVE_GP = r35
+GR_SAVE_PFS = r32
+
+GR_Parameter_X = r38
+GR_Parameter_Y = r39
+GR_Parameter_RESULT = r40
+
+.align 32
+.global logb#
+
+.section .text
+.proc logb#
+.align 32
+
+
+logb:
+
+// qnan snan inf norm     unorm 0 -+
+// 0    0    0   0        1     0 11
+// 0                      b
+{ .mfi
+      alloc r32=ar.pfs,1,5,4,0
+(p0)  fclass.m.unc p8,p0 = f8, 0x0b      // p8: x is unnormalized (table above)
+      nop.i 999
+}
+// X NORMAL
+// r37 = exp(f8) - 0xffff
+// sig(f9) = r37
+// f8 = convert_to_fp (sig(f9))
+{ .mfi
+(p0)  getf.exp r35 = f8
+(p0)  fnorm f10=f8
+      nop.i 999 ;;
+}
+
+// qnan snan inf norm     unorm 0 -+
+// 1    1    1   0        0     0 11
+// e                      3
+{ .mmf
+(p0)  mov r33 = 0xffff                   // exponent bias
+(p0)  mov r34 = 0x1ffff                  // mask: low 17 bits of the getf.exp value
+(p0)  fclass.m.unc p6,p0 = f8, 0xe3 ;;
+}
+
+{ .mfb
+(p0)  and r36 = r35, r34
+(p0)  fclass.m.unc p7,p0 = f8, 0x07
+(p8)  br.cond.spnt L(LOGB_DENORM) ;;
+}
+
+{ .mib
+(p0)  sub r37 = r36, r33
+      nop.i 999
+(p6)  br.cond.spnt L(LOGB_NAN_INF) ;;
+}
+
+{ .mib
+(p0)  setf.sig f9 = r37
+      nop.i 999
+(p7)  br.cond.spnt L(LOGB_ZERO) ;;
+}
+
+{ .mfi
+      nop.m 999
+(p0)  fcvt.xf f10 = f9
+      nop.i 999 ;;
+}
+
+{ .mfb
+      nop.m 999
+(p0)  fnorm.d f8 = f10
+(p0)  br.ret.sptk b0 ;;
+}
+
+L(LOGB_DENORM):
+// Form signexp of 2^64 in case need to scale denormal
+// Check to see if double-extended denormal
+{ .mfi
+(p0)  mov r38 = 0x1003f                  // 0xffff + 64
+(p0)  fclass.m.unc p8,p0 = f10, 0x0b
+      nop.i 999 ;;
+}
+
+// Form 2^64 in case need to scale denormal
+{ .mfi
+(p0)  setf.exp f11 = r38
+      nop.f 999
+      nop.i 999 ;;
+}
+
+// If double-extended denormal add 64 to exponent bias for scaling
+// If double-extended denormal form x * 2^64 which is normal
+{ .mfi
+(p8)  add r33 = 64, r33
+(p8)  fmpy f10 = f10, f11
+      nop.i 999 ;;
+}
+
+// Logic is the same as normal path but use normalized input
+{ .mmi
+(p0)  getf.exp r35 = f10 ;;
+      nop.m 999
+      nop.i 999 ;;
+}
+
+{ .mmi
+(p0)  and r36 = r35, r34 ;;
+(p0)  sub r37 = r36, r33
+      nop.i 999 ;;
+}
+
+{ .mmi
+(p0)  setf.sig f9 = r37
+      nop.m 999
+      nop.i 999 ;;
+}
+
+{ .mfi
+      nop.m 999
+(p0)  fcvt.xf f10 = f9
+      nop.i 999 ;;
+}
+
+{ .mfb
+      nop.m 999
+(p0)  fnorm.d f8 = f10
+(p0)  br.ret.sptk b0 ;;
+}
+
+L(LOGB_NAN_INF):
+
+// X NAN or INFINITY, return f8 * f8
+{ .mfb
+      nop.m 999
+(p0)  fma.d f8= f8,f8,f0
+(p0)  br.ret.sptk b0 ;;
+}
+
+.endp logb#
+ASM_SIZE_DIRECTIVE(logb)
+
+// Stack operations when calling error support.
+//       (1)               (2)                          (3)                      (call)              (4)
+//   sp   -> +          psp -> +                     psp -> +                   sp -> +
+//           |                 |                            |                         |
+//           |                 | <- GR_Y               R3 ->| <- GR_RESULT            | -> f8
+//           |                 |                            |                         |
+//           | <-GR_Y      Y2->|                       Y2 ->| <- GR_Y                 |
+//           |                 |                            |                         |
+//           |                 | <- GR_X               X1 ->|                         |
+//           |                 |                            |                         |
+//  sp-64 -> +          sp ->  +                     sp ->  +                         +
+//    save ar.pfs          save b0                                               restore gp
+//    save gp                                                                    restore ar.pfs
+
+
+
+.proc __libm_error_region
+__libm_error_region:
+L(LOGB_ZERO):
+.prologue
+
+// f9 = |f8|
+// f10 = -f9 = -|f8|
+// f9 = 1.0/f10 = -1.0/|f8|  (x is zero on this path, so the result is -inf)
+
+{ .mfi
+      mov r41 = 151 // Error code
+(p0)  fmerge.s f9 = f0,f8
+      nop.i 999
+}
+;;
+
+
+{ .mfi
+      nop.m 999
+      fmerge.ns f10 = f0,f9
+      nop.i 999
+}
+;;
+
+// (1)
+{ .mfi
+      add GR_Parameter_Y=-32,sp // Parameter 2 value
+      frcpa f9,p6 = f1,f10
+.save ar.pfs,GR_SAVE_PFS
+      mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+      add sp=-64,sp // Create new stack
+      nop.f 0
+      mov GR_SAVE_GP=gp // Save gp
+};;
+
+
+// (2)
+{ .mmi
+      stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
+      add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+      mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+// (3)
+{ .mib
+      stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
+      add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+      nop.b 0
+}
+{ .mib
+      stfd [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
+      add GR_Parameter_Y = -16,GR_Parameter_Y
+      br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+      nop.m 0
+      nop.m 0
+      add GR_Parameter_RESULT = 48,sp // Recompute the result address after the call
+};;
+
+// (4)
+{ .mmi
+      ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+      add sp = 64,sp // Restore stack pointer
+      mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+      mov gp = GR_SAVE_GP // Restore gp
+      mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+      br.ret.sptk b0 // Return
+};;
+
+.endp __libm_error_region
+ASM_SIZE_DIRECTIVE(__libm_error_region)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_logbf.S b/sysdeps/ia64/fpu/s_logbf.S
new file mode 100644
index 0000000000..d3068470d0
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_logbf.S
@@ -0,0 +1,301 @@
+.file "logbf.s"
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://developer.intel.com/opensource.
+//
+// History
+//==============================================================
+// 2/02/00  Initial version
+// 2/16/00  Modified to conform to C9X
+// 3/16/00  Improved speed
+// 4/04/00  Unwind support added
+// 5/30/00  Fixed bug when x double-extended denormal
+// 8/15/00  Bundle added after call to __libm_error_support to properly
+//          set [the previously overwritten] GR_Parameter_RESULT.
+//
+// API
+//==============================================================
+// float logbf( float x);
+//
+// Overview of operation
+//==============================================================
+// The logbf function extracts the exponent of x as an integer in
+// floating-point format.
+// logbf returns the unbiased binary exponent of x, converted to a float
+
+// logbf is similar to ilogbf but differs in the following ways:
+//         +-inf
+//            ilogbf: returns INT_MAX
+//             logbf: returns +inf
+//         NaN  returns FP_LOGBNAN (which is either INT_MAX or INT_MIN)
+//            ilogbf: returns INT_MAX (7fffffff)
+//             logbf: returns QNAN (quietized SNAN)
+//         0    returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX)
+//            ilogbf: returns -INT_MAX (80000001)
+//             logbf: returns -inf, raises the divide-by-zero exception,
+//                    and calls libm_error_support to set domain error
+//
+// Registers used
+//==============================================================
+// general registers used:
+// ar.pfs r32
+// r33 -> r37
+// r38 -> r41 used as parameters to error path
+//
+// predicate registers used:
+// p6, p7, p8
+//
+// floating-point registers used:
+// f9, f10, f11
+// f8, input
+
+#include "libm_support.h"
+
+GR_SAVE_B0 = r34
+// r34 and r35 are also used as scratch on the main path (mask, getf.exp result)
+GR_SAVE_PFS = r32
+GR_SAVE_GP = r35
+
+GR_Parameter_X = r38
+GR_Parameter_Y = r39
+GR_Parameter_RESULT = r40
+GR_Parameter_TAG = r41
+
+FR_X = f8
+FR_Y = f0
+FR_RESULT = f10
+
+
+.align 32
+.global logbf#
+
+.section .text
+.proc logbf#
+.align 32
+
+
+logbf:
+
+// qnan snan inf norm     unorm 0 -+
+// 0    0    0   0        1     0 11
+// 0                      b
+{ .mfi
+      alloc r32=ar.pfs,1,5,4,0
+(p0)  fclass.m.unc p8,p0 = f8, 0x0b      // p8: x is unnormalized (table above)
+      nop.i 999
+}
+// X NORMAL
+// r37 = exp(f8) - 0xffff
+// sig(f9) = r37
+// f8 = convert_to_fp (sig(f9))
+{ .mfi
+(p0)  getf.exp r35 = f8
+(p0)  fnorm f10=f8
+      nop.i 999 ;;
+}
+
+// qnan snan inf norm     unorm 0 -+
+// 1    1    1   0        0     0 11
+// e                      3
+{ .mmf
+(p0)  mov r33 = 0xffff                   // exponent bias
+(p0)  mov r34 = 0x1ffff                  // mask: low 17 bits of the getf.exp value
+(p0)  fclass.m.unc p6,p0 = f8, 0xe3 ;;
+}
+
+{ .mfb
+(p0)  and r36 = r35, r34
+(p0)  fclass.m.unc p7,p0 = f8, 0x07
+(p8)  br.cond.spnt L(LOGB_DENORM) ;;
+}
+
+{ .mib
+(p0)  sub r37 = r36, r33
+      nop.i 999
+(p6)  br.cond.spnt L(LOGB_NAN_INF) ;;
+}
+
+{ .mib
+(p0)  setf.sig f9 = r37
+      nop.i 999
+(p7)  br.cond.spnt L(LOGB_ZERO) ;;
+}
+
+{ .mfi
+      nop.m 999
+(p0)  fcvt.xf f10 = f9
+      nop.i 999 ;;
+}
+
+{ .mfb
+      nop.m 999
+(p0)  fnorm.s f8 = f10
+(p0)  br.ret.sptk b0 ;;
+}
+
+L(LOGB_DENORM):
+// Form signexp of 2^64 in case need to scale denormal
+// Check to see if double-extended denormal
+{ .mfi
+(p0)  mov r38 = 0x1003f                  // 0xffff + 64
+(p0)  fclass.m.unc p8,p0 = f10, 0x0b
+      nop.i 999 ;;
+}
+
+// Form 2^64 in case need to scale denormal
+{ .mfi
+(p0)  setf.exp f11 = r38
+      nop.f 999
+      nop.i 999 ;;
+}
+
+// If double-extended denormal add 64 to exponent bias for scaling
+// If double-extended denormal form x * 2^64 which is normal
+{ .mfi
+(p8)  add r33 = 64, r33
+(p8)  fmpy f10 = f10, f11
+      nop.i 999 ;;
+}
+
+// Logic is the same as normal path but use normalized input
+{ .mmi
+(p0)  getf.exp r35 = f10 ;;
+      nop.m 999
+      nop.i 999 ;;
+}
+
+{ .mmi
+(p0)  and r36 = r35, r34 ;;
+(p0)  sub r37 = r36, r33
+      nop.i 999 ;;
+}
+
+{ .mmi
+(p0)  setf.sig f9 = r37
+      nop.m 999
+      nop.i 999 ;;
+}
+
+{ .mfi
+      nop.m 999
+(p0)  fcvt.xf f10 = f9
+      nop.i 999 ;;
+}
+
+{ .mfb
+      nop.m 999
+(p0)  fnorm.s f8 = f10
+(p0)  br.ret.sptk b0 ;;
+}
+
+L(LOGB_NAN_INF):
+
+// X NAN or INFINITY, return f8 * f8
+{ .mfb
+      nop.m 999
+(p0)  fma.s f8= f8,f8,f0
+(p0)  br.ret.sptk b0 ;;
+}
+
+L(LOGB_ZERO):
+
+// X ZERO
+// return -1.0/fabs(f8)=-inf, set divide-by-zero flag, call error support
+{ .mfi
+      nop.m 999
+(p0)  fmerge.s f9 = f0,f8                // f9 = |f8|
+      nop.i 999 ;;
+}
+
+{ .mfi
+      nop.m 999
+(p0)  fmerge.ns f10 = f0,f9              // f10 = -|f8|
+      nop.i 999 ;;
+}
+
+{ .mfi
+      nop.m 999
+(p0)  frcpa f10,p6 = f1,f10              // FR_RESULT = 1.0 / -|f8|
+      nop.i 999 ;;
+}
+
+.endp logbf                              // no branch here: LOGB_ZERO runs on into __libm_error_region
+ASM_SIZE_DIRECTIVE(logbf)
+
+
+.proc __libm_error_region
+__libm_error_region:
+.prologue
+{ .mii
+      add GR_Parameter_Y=-32,sp // Parameter 2 value
+(p0)  mov GR_Parameter_TAG = 152
+.save ar.pfs,GR_SAVE_PFS
+      mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+      add sp=-64,sp // Create new stack
+      nop.f 0
+      mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+      stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
+      add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+      mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+      stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+      add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+      nop.b 0
+}
+{ .mib
+      stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+      add GR_Parameter_Y = -16,GR_Parameter_Y
+      br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+      nop.m 0
+      nop.m 0
+      add GR_Parameter_RESULT = 48,sp // Recompute the result address after the call
+};;
+{ .mmi
+      ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+      add sp = 64,sp // Restore stack pointer
+      mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+      mov gp = GR_SAVE_GP // Restore gp
+      mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+      br.ret.sptk b0 // Return
+};;
+
+.endp __libm_error_region
+ASM_SIZE_DIRECTIVE(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_logbl.S b/sysdeps/ia64/fpu/s_logbl.S
new file mode 100644
index 0000000000..e8275b221f
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_logbl.S
@@ -0,0 +1,286 @@
+.file "logbl.s"
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://developer.intel.com/opensource.
+//
+// History
+//==============================================================
+// 2/02/00  Initial version
+// 2/16/00  Modified to conform to C9X
+// 3/16/00  Improved speed
+// 4/04/00  Unwind support added
+// 5/30/00  Fixed bug when x double-extended denormal
+// 8/15/00  Bundle added after call to __libm_error_support to properly
+//          set [the previously overwritten] GR_Parameter_RESULT.
+//
+// API
+//==============================================================
+// long double logbl( long double x);
+//
+// Overview of operation
+//==============================================================
+// The logbl function extracts the exponent of x as an integer in
+// floating-point format.
+// logbl returns the unbiased binary exponent of x, converted to a long double
+//
+// logbl is similar to ilogbl but differs in the following ways:
+//         +-inf
+//            ilogbl: returns INT_MAX
+//             logbl: returns +inf
+//         NaN  returns FP_LOGBNAN (which is either INT_MAX or INT_MIN)
+//            ilogbl: returns INT_MAX (7fffffff)
+//             logbl: returns QNAN (quietized SNAN)
+//         0    returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX)
+//            ilogbl: returns -INT_MAX (80000001)
+//             logbl: returns -inf, raises the divide-by-zero exception,
+//                    and calls libm_error_support to set domain error
+//
+// Registers used
+//==============================================================
+// general registers used:
+// ar.pfs r32
+// r33 -> r37
+// r38 -> r41 used as parameters to error path
+//
+// predicate registers used:
+// p6, p7, p8
+//
+// floating-point registers used:
+// f9, f10, f11
+// f8, input
+
+#include "libm_support.h"
+
+GR_SAVE_PFS = r32
+GR_SAVE_B0 = r34
+GR_SAVE_GP = r35
+GR_Parameter_X = r38
+GR_Parameter_Y = r39
+GR_Parameter_RESULT = r40
+GR_Parameter_TAG = r41
+
+FR_X = f8
+FR_Y = f0
+FR_RESULT = f10
+
+.align 32
+.global logbl#
+
+.section .text
+.proc logbl#
+.align 32
+
+
+logbl:
+
+// qnan snan inf norm     unorm 0 -+
+// 0    0    0   0        1     0 11
+// 0                      b
+{ .mfi
+      alloc r32=ar.pfs,1,5,4,0
+(p0)  fclass.m.unc p8,p0 = f8, 0x0b      // p8: x is unnormalized (table above)
+      nop.i 999
+}
+// X NORMAL
+// r37 = exp(f8) - 0xffff
+// sig(f9) = r37
+// f8 = convert_to_fp (sig(f9))
+{ .mfi
+(p0)  getf.exp r35 = f8
+(p0)  fnorm f10=f8
+      nop.i 999 ;;
+}
+
+// qnan snan inf norm     unorm 0 -+
+// 1    1    1   0        0     0 11
+// e                      3
+{ .mmf
+(p0)  mov r33 = 0xffff                   // exponent bias
+(p0)  mov r34 = 0x1ffff                  // mask: low 17 bits of the getf.exp value
+(p0)  fclass.m.unc p6,p0 = f8, 0xe3 ;;
+}
+
+{ .mfb
+(p0)  and r36 = r35, r34
+(p0)  fclass.m.unc p7,p0 = f8, 0x07
+(p8)  br.cond.spnt L(LOGB_DENORM) ;;
+}
+
+{ .mib
+(p0)  sub r37 = r36, r33
+      nop.i 999
+(p6)  br.cond.spnt L(LOGB_NAN_INF) ;;
+}
+
+{ .mib
+(p0)  setf.sig f9 = r37
+      nop.i 999
+(p7)  br.cond.spnt L(LOGB_ZERO) ;;
+}
+{ .mfi
+      nop.m 999
+(p0)  fcvt.xf f10 = f9
+      nop.i 999 ;;
+}
+
+{ .mfb
+      nop.m 999
+(p0)  fnorm f8 = f10
+(p0)  br.ret.sptk b0 ;;
+}
+
+L(LOGB_DENORM):
+// Form signexp of 2^64 in case need to scale denormal
+// Check to see if double-extended denormal
+{ .mfi
+(p0)  mov r38 = 0x1003f                  // 0xffff + 64
+(p0)  fclass.m.unc p8,p0 = f10, 0x0b
+      nop.i 999 ;;
+}
+
+// Form 2^64 in case need to scale denormal
+{ .mfi
+(p0)  setf.exp f11 = r38
+      nop.f 999
+      nop.i 999 ;;
+}
+
+// If double-extended denormal add 64 to exponent bias for scaling
+// If double-extended denormal form x * 2^64 which is normal
+{ .mfi
+(p8)  add r33 = 64, r33
+(p8)  fmpy f10 = f10, f11
+      nop.i 999 ;;
+}
+
+// Logic is the same as normal path but use normalized input
+{ .mmi
+(p0)  getf.exp r35 = f10 ;;
+      nop.m 999
+      nop.i 999 ;;
+}
+
+{ .mmi
+(p0)  and r36 = r35, r34 ;;
+(p0)  sub r37 = r36, r33
+      nop.i 999 ;;
+}
+
+{ .mmi
+(p0)  setf.sig f9 = r37
+      nop.m 999
+      nop.i 999 ;;
+}
+
+{ .mfi
+      nop.m 999
+(p0)  fcvt.xf f10 = f9
+      nop.i 999 ;;
+}
+
+{ .mfb
+      nop.m 999
+(p0)  fnorm f8 = f10
+(p0)  br.ret.sptk b0 ;;
+}
+
+L(LOGB_NAN_INF):
+
+// X NAN or INFINITY, return f8 * f8
+{ .mfb
+      nop.m 999
+(p0)  fma f8= f8,f8,f0
+(p0)  br.ret.sptk b0 ;;
+}
+
+L(LOGB_ZERO):
+{.mfi
+      nop.m 0
+(p0)  frcpa.s0 f10,p6 = f1,f0            // f10 = 1.0/0.0
+      nop.i 0
+};;
+{.mfi
+      mov GR_Parameter_TAG = 150
+(p0)  fms.s1 f10 = f0,f0,f10             // FR_RESULT = 0*0 - f10
+      nop.i 0
+};;
+// X ZERO
+// return -(1.0/0.0) = -inf, set divide-by-zero flag, then run on into error support
+.endp logbl
+ASM_SIZE_DIRECTIVE(logbl)
+
+.proc __libm_error_region
+__libm_error_region:
+.prologue
+{ .mfi
+      add GR_Parameter_Y=-32,sp // Parameter 2 value
+      nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+      mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+      add sp=-64,sp // Create new stack
+      nop.f 0
+      mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+      stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
+      add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+      mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+      stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+      add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+      nop.b 0
+}
+{ .mib
+      stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+      add GR_Parameter_Y = -16,GR_Parameter_Y
+      br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+      nop.m 0
+      nop.m 0
+      add GR_Parameter_RESULT = 48,sp // Recompute the result address after the call
+};;
+{ .mmi
+      ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+      add sp = 64,sp // Restore stack pointer
+      mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+      mov gp = GR_SAVE_GP // Restore gp
+      mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+      br.ret.sptk b0 // Return
+};;
+
+.endp __libm_error_region
+ASM_SIZE_DIRECTIVE(__libm_error_region)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_matherrf.c b/sysdeps/ia64/fpu/s_matherrf.c
new file mode 100644
index 0000000000..4b3033ecc3
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_matherrf.c
@@ -0,0 +1,33 @@
+/* Derived from: */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+
+#include "math.h"
+#include "math_private.h"
+#include "libm_support.h"
+
+#ifdef __STDC__
+	int
+	weak_function
+	__matherrf(struct exceptionf *x)
+#else
+	int
+	weak_function
+	__matherrf(x)
+	struct exceptionf *x;
+#endif
+{
+	int n=0;
+	if(x->arg1!=x->arg1) return 0;	/* presumably a NaN test on arg1; returns 0 either way */
+	return n;			/* n is never modified, so this is 0 as well */
+}
+weak_alias (__matherrf, matherrf)
diff --git a/sysdeps/ia64/fpu/s_matherrl.c b/sysdeps/ia64/fpu/s_matherrl.c
new file mode 100644
index 0000000000..751cc6b51e
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_matherrl.c
@@ -0,0 +1,33 @@
+/* Derived from: */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc.
All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + + +#include "math.h" +#include "math_private.h" +#include "libm_support.h" + +#ifdef __STDC__ + int + weak_function + __matherrl(struct exceptionl *x) +#else + int + weak_function + __matherrl(x) + struct exceptionl *x; +#endif +{ + int n=0; + if(x->arg1!=x->arg1) return 0; + return n; +} +weak_alias (__matherrl, matherrl) diff --git a/sysdeps/ia64/fpu/s_modf.S b/sysdeps/ia64/fpu/s_modf.S new file mode 100644 index 0000000000..0bfad13763 --- /dev/null +++ b/sysdeps/ia64/fpu/s_modf.S @@ -0,0 +1,272 @@ +.file "modf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 4/04/00: Improved speed, corrected result for NaN input +// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for +// qnans nor for inputs larger than 2^63. +// +// API +//============================================================== +// double modf(double x, double *iptr) +// break a floating point number x into fractional and integral parts +// +// input floating point f8, address in r33 +// output floating point f8 (x fraction), and *iptr (x integral part) +// +// OVERVIEW +//============================================================== +// +// NO FRACTIONAL PART: HUGE +// If +// for double-extended +// If the true exponent is greater than or equal 63 +// 1003e ==> 1003e -ffff = 3f = 63(dec) +// for double +// If the true exponent is greater than or equal 52 +// 10033 -ffff = 34 = 52(dec) +// for single +// If the true exponent is greater than or equal 23 +// 10016 -ffff = 17 = 23(dec) +// then +// we are already an integer (p9 true) + +// NO INTEGER PART: SMALL +// Is f8 exponent less than register bias (that is, is it +// less than 1). If it is, get the right sign of +// zero and store this in iptr.
+ +// CALCULATION: NOT HUGE, NOT SMALL +// To get the integer part +// Take the floating-point input and truncate +// then convert this integer to fp Call it MODF_INTEGER_PART + +// Subtract MODF_INTEGER_PART from MODF_NORM_F8 to get fraction part +// Then put fraction part in f8 +// put integer part MODF_INTEGER_PART into *iptr + +// Registers used +//============================================================== + +// predicate registers used: +// p6 - p13 + +// 0xFFFF 0x10033 +// -----------------------+-----------------+------------- +// SMALL | NORMAL | HUGE +// p11 --------------->|<----- p12 ----->| <-------------- p9 +// p10 --------------------------------->| +// p13 --------------------------------------------------->| +// + +#include "libm_support.h" + +// floating-point registers used: +MODF_NORM_F8 = f9 +MODF_FRACTION_PART = f10 +MODF_INTEGER_PART = f11 +MODF_INT_INTEGER_PART = f12 + + +// general registers used +modf_signexp = r14 +modf_GR_no_frac = r15 +modf_GR_FFFF = r16 +modf_17_ones = r17 +modf_exp = r18 +// r33 = iptr + + +.align 32 +.global modf# + +.section .text +.proc modf# +.align 32 + + +// Main path is p9, p11, p8 FALSE and p12 TRUE + +// Assume input is normalized and get signexp +// Normalize input just in case +// Form exponent bias +modf: +{ .mfi + getf.exp modf_signexp = f8 + fnorm MODF_NORM_F8 = f8 + addl modf_GR_FFFF = 0xffff, r0 +} +// Get integer part of input +// Form exponent mask +{ .mfi + nop.m 999 + fcvt.fx.trunc.s1 MODF_INT_INTEGER_PART = f8 + mov modf_17_ones = 0x1ffff ;; +} + +// Is x nan or inf? +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 = 0xe3 NAN_INF +// Form biased exponent where input only has an integer part +{ .mfi + nop.m 999 + fclass.m.unc p6,p13 = f8, 0xe3 + addl modf_GR_no_frac = 0x10033, r0 ;; +} + +// Mask to get exponent +// Is x unnorm? 
+// qnan snan inf norm unorm 0 -+ +// 0 0 0 0 1 0 11 = 0x0b UNORM +// Set p13 to indicate calculation path, else p6 if nan or inf +{ .mfi + and modf_exp = modf_17_ones, modf_signexp + fclass.m.unc p8,p0 = f8, 0x0b + nop.i 999 ;; +} + +// p11 <== SMALL, no integer part, fraction is everything +// p9 <== HUGE, no fraction part, integer is everything +// p12 <== NORMAL, fraction part and integer part +{ .mii +(p13) cmp.lt.unc p11,p10 = modf_exp, modf_GR_FFFF + nop.i 999 + nop.i 999 ;; +} + +// Is x inf? p6 if inf, p7 if nan +{ .mfb +(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac +(p6) fclass.m.unc p6,p7 = f8, 0x23 +(p8) br.cond.spnt L(MODF_DENORM) ;; +} + +L(MODF_COMMON): +// For HUGE set fraction to signed 0 +{ .mfi + nop.m 999 +(p9) fmerge.s f8 = f8,f0 + nop.i 999 +} +// For HUGE set integer part to normalized input +{ .mfi + nop.m 999 +(p9) fnorm.d MODF_INTEGER_PART = MODF_NORM_F8 + nop.i 999 ;; +} + +// For SMALL set fraction to normalized input, integer part to signed 0 +{ .mfi + nop.m 999 +(p11) fmerge.s MODF_INTEGER_PART = f8,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fnorm.d f8 = MODF_NORM_F8 + nop.i 999 ;; +} + +// For NORMAL float the integer part +{ .mfi + nop.m 999 +(p12) fcvt.xf MODF_INTEGER_PART = MODF_INT_INTEGER_PART + nop.i 999 ;; +} + +// If x inf set integer part to INF, fraction to signed 0 +{ .mfi +(p6) stfd [r33] = MODF_NORM_F8 +(p6) fmerge.s f8 = f8,f0 + nop.i 999 ;; +} + +// If x nan set integer and fraction parts to NaN (quietized) +{ .mfi +(p7) stfd [r33] = MODF_NORM_F8 +(p7) fmerge.s f8 = MODF_NORM_F8, MODF_NORM_F8 + nop.i 999 ;; +} + +{ .mmi +(p9) stfd [r33] = MODF_INTEGER_PART + nop.m 999 + nop.i 999 ;; +} + +// For NORMAL compute fraction part +{ .mfi +(p11) stfd [r33] = MODF_INTEGER_PART +(p12) fms.d.s0 f8 = MODF_NORM_F8,f1, MODF_INTEGER_PART + nop.i 999 ;; +} + +// For NORMAL test if fraction part is zero; if so append correct sign +{ .mfi + nop.m 999 +(p12) fcmp.eq.unc p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART + nop.i 999 ;; +} +
+{ .mfi +(p12) stfd [r33] = MODF_INTEGER_PART + nop.f 999 + nop.i 999 ;; +} + +// For NORMAL if fraction part is zero append sign of input +{ .mfb + nop.m 999 +(p7) fmerge.s f8 = MODF_NORM_F8, f0 + br.ret.sptk b0 ;; +} + +L(MODF_DENORM): +// If x unorm get signexp from normalized input +// If x unorm get integer part from normalized input +{ .mfi + getf.exp modf_signexp = MODF_NORM_F8 + fcvt.fx.trunc.s1 MODF_INT_INTEGER_PART = MODF_NORM_F8 + nop.i 999 ;; +} + +// If x unorm mask to get exponent +{ .mmi + and modf_exp = modf_17_ones, modf_signexp ;; + cmp.lt.unc p11,p10 = modf_exp, modf_GR_FFFF + nop.i 999 ;; +} + +{ .mfb +(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac + nop.f 999 + br.cond.spnt L(MODF_COMMON) ;; +} + +.endp modf +ASM_SIZE_DIRECTIVE(modf) diff --git a/sysdeps/ia64/fpu/s_modff.S b/sysdeps/ia64/fpu/s_modff.S new file mode 100644 index 0000000000..e56a07c079 --- /dev/null +++ b/sysdeps/ia64/fpu/s_modff.S @@ -0,0 +1,272 @@ +.file "modff.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 4/04/00: Improved speed, corrected result for NaN input +// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for +// qnans nor for inputs larger than 2^63. +// +// API +//============================================================== +// float modff(float x, float *iptr) +// break a floating point number x into fractional and integral parts +// +// input floating point f8, address in r33 +// output floating point f8 (x fraction), and *iptr (x integral part) +// +// OVERVIEW +//============================================================== + +// NO FRACTIONAL PART: HUGE +// If +// for double-extended +// If the true exponent is greater than or equal 63 +// 1003e ==> 1003e -ffff = 3f = 63(dec) +// for double +// If the true exponent is greater than or equal 52 +// 10033 -ffff = 34 = 52(dec) +// for single +// If the true exponent is greater than or equal 23 +// 10016 -ffff = 17 = 23(dec) +// then +// we are already an integer (p9 true) + +// NO INTEGER PART: SMALL +// Is f8 exponent less than register bias (that is, is it +// less than 1). If it is, get the right sign of +// zero and store this in iptr.
+ +// CALCULATION: NOT HUGE, NOT SMALL +// To get the integer part +// Take the floating-point input and truncate +// then convert this integer to fp Call it MODF_INTEGER_PART + +// Subtract MODF_INTEGER_PART from MODF_NORM_F8 to get fraction part +// Then put fraction part in f8 +// put integer part MODF_INTEGER_PART into *iptr + +// Registers used +//============================================================== + +// predicate registers used: +// p6 - p13 + +// 0xFFFF 0x10016 +// -----------------------+-----------------+------------- +// SMALL | NORMAL | HUGE +// p11 --------------->|<----- p12 ----->| <-------------- p9 +// p10 --------------------------------->| +// p13 --------------------------------------------------->| +// + +#include "libm_support.h" + +// floating-point registers used: +MODF_NORM_F8 = f9 +MODF_FRACTION_PART = f10 +MODF_INTEGER_PART = f11 +MODF_INT_INTEGER_PART = f12 + + +// general registers used +modf_signexp = r14 +modf_GR_no_frac = r15 +modf_GR_FFFF = r16 +modf_17_ones = r17 +modf_exp = r18 +// r33 = iptr + + +.align 32 +.global modff# + +.section .text +.proc modff# +.align 32 + + +// Main path is p9, p11, p8 FALSE and p12 TRUE + +// Assume input is normalized and get signexp +// Normalize input just in case +// Form exponent bias +modff: +{ .mfi + getf.exp modf_signexp = f8 + fnorm MODF_NORM_F8 = f8 + addl modf_GR_FFFF = 0xffff, r0 +} +// Get integer part of input +// Form exponent mask +{ .mfi + nop.m 999 + fcvt.fx.trunc.s1 MODF_INT_INTEGER_PART = f8 + mov modf_17_ones = 0x1ffff ;; +} + +// Is x nan or inf? +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 = 0xe3 NAN_INF +// Form biased exponent where input only has an integer part +{ .mfi + nop.m 999 + fclass.m.unc p6,p13 = f8, 0xe3 + addl modf_GR_no_frac = 0x10016, r0 ;; +} + +// Mask to get exponent +// Is x unnorm? 
+// qnan snan inf norm unorm 0 -+ +// 0 0 0 0 1 0 11 = 0x0b UNORM +// Set p13 to indicate calculation path, else p6 if nan or inf +{ .mfi + and modf_exp = modf_17_ones, modf_signexp + fclass.m.unc p8,p0 = f8, 0x0b + nop.i 999 ;; +} + +// p11 <== SMALL, no integer part, fraction is everything +// p9 <== HUGE, no fraction part, integer is everything +// p12 <== NORMAL, fraction part and integer part +{ .mii +(p13) cmp.lt.unc p11,p10 = modf_exp, modf_GR_FFFF + nop.i 999 + nop.i 999 ;; +} + +// Is x inf? p6 if inf, p7 if nan +{ .mfb +(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac +(p6) fclass.m.unc p6,p7 = f8, 0x23 +(p8) br.cond.spnt L(MODF_DENORM) ;; +} + +L(MODF_COMMON): +// For HUGE set fraction to signed 0 +{ .mfi + nop.m 999 +(p9) fmerge.s f8 = f8,f0 + nop.i 999 +} +// For HUGE set integer part to normalized input +{ .mfi + nop.m 999 +(p9) fnorm.s MODF_INTEGER_PART = MODF_NORM_F8 + nop.i 999 ;; +} + +// For SMALL set fraction to normalized input, integer part to signed 0 +{ .mfi + nop.m 999 +(p11) fmerge.s MODF_INTEGER_PART = f8,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fnorm.s f8 = MODF_NORM_F8 + nop.i 999 ;; +} + +// For NORMAL float the integer part +{ .mfi + nop.m 999 +(p12) fcvt.xf MODF_INTEGER_PART = MODF_INT_INTEGER_PART + nop.i 999 ;; +} + +// If x inf set integer part to INF, fraction to signed 0 +{ .mfi +(p6) stfs [r33] = MODF_NORM_F8 +(p6) fmerge.s f8 = f8,f0 + nop.i 999 ;; +} + +// If x nan set integer and fraction parts to NaN (quietized) +{ .mfi +(p7) stfs [r33] = MODF_NORM_F8 +(p7) fmerge.s f8 = MODF_NORM_F8, MODF_NORM_F8 + nop.i 999 ;; +} + +{ .mmi +(p9) stfs [r33] = MODF_INTEGER_PART + nop.m 999 + nop.i 999 ;; +} + +// For NORMAL compute fraction part +{ .mfi +(p11) stfs [r33] = MODF_INTEGER_PART +(p12) fms.s.s0 f8 = MODF_NORM_F8,f1, MODF_INTEGER_PART + nop.i 999 ;; +} + +// For NORMAL test if fraction part is zero; if so append correct sign +{ .mfi + nop.m 999 +(p12) fcmp.eq.unc p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART + nop.i 999 ;; +} +
+{ .mfi +(p12) stfs [r33] = MODF_INTEGER_PART + nop.f 999 + nop.i 999 ;; +} + +// For NORMAL if fraction part is zero append sign of input +{ .mfb + nop.m 999 +(p7) fmerge.s f8 = MODF_NORM_F8, f0 + br.ret.sptk b0 ;; +} + +L(MODF_DENORM): +// If x unorm get signexp from normalized input +// If x unorm get integer part from normalized input +{ .mfi + getf.exp modf_signexp = MODF_NORM_F8 + fcvt.fx.trunc.s1 MODF_INT_INTEGER_PART = MODF_NORM_F8 + nop.i 999 ;; +} + +// If x unorm mask to get exponent +{ .mmi + and modf_exp = modf_17_ones, modf_signexp ;; + cmp.lt.unc p11,p10 = modf_exp, modf_GR_FFFF + nop.i 999 ;; +} + +{ .mfb +(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac + nop.f 999 + br.cond.spnt L(MODF_COMMON) ;; +} + +.endp modff +ASM_SIZE_DIRECTIVE(modff) diff --git a/sysdeps/ia64/fpu/s_modfl.S b/sysdeps/ia64/fpu/s_modfl.S new file mode 100644 index 0000000000..e15508ba61 --- /dev/null +++ b/sysdeps/ia64/fpu/s_modfl.S @@ -0,0 +1,267 @@ +.file "modfl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 4/04/00: Improved speed, corrected result for NaN input +// 5/30/00 Fixed bug for exponent 0x1003e +// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for +// qnans nor for inputs larger than 2^63. +// +// API +//============================================================== +// long double modfl(long double x, long double *iptr) +// break a floating point number x into fractional and integral parts +// +// input floating point f8, address in r34 +// output floating point f8 (x fraction), and *iptr (x integral part) +// +// OVERVIEW +//============================================================== +// +// NO FRACTIONAL PART: HUGE +// If +// for double-extended +// If the true exponent is >= 63 +// 1003e ==> 1003e -ffff = 3f = 63(dec) +// then +// we are already an integer (p9 true) + +// NO INTEGER PART: SMALL +// Is f8 exponent less than register bias (that is, is it +// less than 1). If it is, get the right sign of +// zero and store this in iptr.
+ +// CALCULATION: NOT HUGE, NOT SMALL +// To get the integer part +// Take the floating-point input and truncate +// then convert this integer to fp Call it MODF_INTEGER_PART + +// Subtract MODF_INTEGER_PART from MODF_NORM_F8 to get fraction part +// Then put fraction part in f8 +// put integer part MODF_INTEGER_PART into *iptr + +// Registers used +//============================================================== + +// predicate registers used: +// p6 - p13 + +// 0xFFFF 0x1003e +// -----------------------+-----------------+------------- +// SMALL | NORMAL | HUGE +// p11 --------------->|<----- p12 ----->| <-------------- p9 +// p10 --------------------------------->| +// p13 --------------------------------------------------->| +// + +#include "libm_support.h" + +// floating-point registers used: +MODF_NORM_F8 = f9 +MODF_FRACTION_PART = f10 +MODF_INTEGER_PART = f11 +MODF_INT_INTEGER_PART = f12 + + +// general registers used +modf_signexp = r14 +modf_GR_no_frac = r15 +modf_GR_FFFF = r16 +modf_17_ones = r17 +modf_exp = r18 +// r34 = iptr + + +.align 32 +.global modfl# + +.section .text +.proc modfl# +.align 32 + + +// Main path is p9, p11, p8 FALSE and p12 TRUE + +// Assume input is normalized and get signexp +// Normalize input just in case +// Form exponent bias +modfl: +{ .mfi + getf.exp modf_signexp = f8 + fnorm MODF_NORM_F8 = f8 + addl modf_GR_FFFF = 0xffff, r0 +} +// Get integer part of input +// Form exponent mask +{ .mfi + nop.m 999 + fcvt.fx.trunc.s1 MODF_INT_INTEGER_PART = f8 + mov modf_17_ones = 0x1ffff ;; +} + +// Is x nan or inf? +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 0 11 = 0xe3 NAN_INF +// Form biased exponent where input only has an integer part +{ .mfi + nop.m 999 + fclass.m.unc p6,p13 = f8, 0xe3 + addl modf_GR_no_frac = 0x1003e, r0 ;; +} + +// Mask to get exponent +// Is x unnorm? 
+// qnan snan inf norm unorm 0 -+ +// 0 0 0 0 1 0 11 = 0x0b UNORM +// Set p13 to indicate calculation path, else p6 if nan or inf +{ .mfi + and modf_exp = modf_17_ones, modf_signexp + fclass.m.unc p8,p0 = f8, 0x0b + nop.i 999 ;; +} + +// p11 <== SMALL, no integer part, fraction is everything +// p9 <== HUGE, no fraction part, integer is everything +// p12 <== NORMAL, fraction part and integer part +{ .mii +(p13) cmp.lt.unc p11,p10 = modf_exp, modf_GR_FFFF + nop.i 999 + nop.i 999 ;; +} + +// Is x inf? p6 if inf, p7 if nan +{ .mfb +(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac +(p6) fclass.m.unc p6,p7 = f8, 0x23 +(p8) br.cond.spnt L(MODF_DENORM) ;; +} + +L(MODF_COMMON): +// For HUGE set fraction to signed 0 +{ .mfi + nop.m 999 +(p9) fmerge.s f8 = f8,f0 + nop.i 999 +} +// For HUGE set integer part to normalized input +{ .mfi + nop.m 999 +(p9) fnorm MODF_INTEGER_PART = MODF_NORM_F8 + nop.i 999 ;; +} + +// For SMALL set fraction to normalized input, integer part to signed 0 +{ .mfi + nop.m 999 +(p11) fmerge.s MODF_INTEGER_PART = f8,f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fnorm f8 = MODF_NORM_F8 + nop.i 999 ;; +} + +// For NORMAL float the integer part +{ .mfi + nop.m 999 +(p12) fcvt.xf MODF_INTEGER_PART = MODF_INT_INTEGER_PART + nop.i 999 ;; +} + +// If x inf set integer part to INF, fraction to signed 0 +{ .mfi +(p6) stfe [r34] = MODF_NORM_F8 +(p6) fmerge.s f8 = f8,f0 + nop.i 999 ;; +} + +// If x nan set integer and fraction parts to NaN (quietized) +{ .mfi +(p7) stfe [r34] = MODF_NORM_F8 +(p7) fmerge.s f8 = MODF_NORM_F8, MODF_NORM_F8 + nop.i 999 ;; +} + +{ .mmi +(p9) stfe [r34] = MODF_INTEGER_PART + nop.m 999 + nop.i 999 ;; +} + +// For NORMAL compute fraction part +{ .mfi +(p11) stfe [r34] = MODF_INTEGER_PART +(p12) fms.s0 f8 = MODF_NORM_F8,f1, MODF_INTEGER_PART + nop.i 999 ;; +} + +// For NORMAL test if fraction part is zero; if so append correct sign +{ .mfi + nop.m 999 +(p12) fcmp.eq.unc p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART + nop.i 999 ;; +} + +{ .mfi
+(p12) stfe [r34] = MODF_INTEGER_PART + nop.f 999 + nop.i 999 ;; +} + +// For NORMAL if fraction part is zero append sign of input +{ .mfb + nop.m 999 +(p7) fmerge.s f8 = MODF_NORM_F8, f0 + br.ret.sptk b0 ;; +} + +L(MODF_DENORM): +// If x unorm get signexp from normalized input +// If x unorm get integer part from normalized input +{ .mfi + getf.exp modf_signexp = MODF_NORM_F8 + fcvt.fx.trunc.s1 MODF_INT_INTEGER_PART = MODF_NORM_F8 + nop.i 999 ;; +} + +// If x unorm mask to get exponent +{ .mmi + and modf_exp = modf_17_ones, modf_signexp ;; + cmp.lt.unc p11,p10 = modf_exp, modf_GR_FFFF + nop.i 999 ;; +} + +{ .mfb +(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac + nop.f 999 + br.cond.spnt L(MODF_COMMON) ;; +} + +.endp modfl +ASM_SIZE_DIRECTIVE(modfl) diff --git a/sysdeps/ia64/fpu/s_nearbyint.S b/sysdeps/ia64/fpu/s_nearbyint.S new file mode 100644 index 0000000000..8c7e4a9ec9 --- /dev/null +++ b/sysdeps/ia64/fpu/s_nearbyint.S @@ -0,0 +1,221 @@ +.file "nearbyint.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 10/19/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the +// Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 10/19/2000: Created +// 2/08/01 Corrected behavior for all rounding modes. +//============================================================== +// +// API +//============================================================== +// double nearbyint(double x) + +#include "libm_support.h" + +// +// general registers used: +// + +nearbyint_GR_signexp = r14 +nearbyint_GR_exponent = r15 +nearbyint_GR_17ones = r16 +nearbyint_GR_10033 = r17 +nearbyint_GR_fpsr = r18 +nearbyint_GR_rcs0 = r19 +nearbyint_GR_rcs0_mask = r20 + + +// predicate registers used: +// p6-11 + +// floating-point registers used: + +NEARBYINT_NORM_f8 = f9 +NEARBYINT_FLOAT_INT_f8 = f10 +NEARBYINT_INT_f8 = f11 + +// Overview of operation +//============================================================== + +// double nearbyint(double x) +// Return an integer value (represented as a double) that is x rounded to integer in current +// rounding mode +// ******************************************************************************* + +// Set denormal flag for denormal input and +// and take denormal fault if necessary. + +// Is the input an integer value already? 
+ +// double_extended +// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is >= 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. + +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +.align 32 +.global nearbyint# + +.section .text +.proc nearbyint# +.align 32 + + +nearbyint: + +{ .mfi + mov nearbyint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 + fcvt.fx.s1 NEARBYINT_INT_f8 = f8 + addl nearbyint_GR_10033 = 0x10033, r0 +} +{ .mfi + nop.m 999 + fnorm.s1 NEARBYINT_NORM_f8 = f8 + mov nearbyint_GR_17ones = 0x1FFFF +;; +} + +{ .mfi + nop.m 999 + fclass.m.unc p6,p0 = f8, 0xe7 + mov nearbyint_GR_rcs0_mask = 0x0c00 +;; +} + +{ .mfb + nop.m 999 +(p6) fnorm.d f8 = f8 +(p6) br.ret.spnt b0 // Exit if x nan, inf, zero +;; +} + +{ .mfi + nop.m 999 + fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8 + nop.i 999 +;; +} + +{ .mfi + getf.exp nearbyint_GR_signexp = NEARBYINT_NORM_f8 + fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal + nop.i 999 +;; +} + + +{ .mii + nop.m 999 + nop.i 999 + and nearbyint_GR_exponent = nearbyint_GR_signexp, nearbyint_GR_17ones +;; +} + +{ .mmi + cmp.ge.unc p7,p6 = nearbyint_GR_exponent, nearbyint_GR_10033 + and nearbyint_GR_rcs0 = nearbyint_GR_rcs0_mask, nearbyint_GR_fpsr + nop.i 999 +;; +} + +// Check to see if s0 rounding mode is round to nearest.
If not then set s2 +// rounding mode to that of s0 and repeat conversions. +L(NEARBYINT_COMMON): +{ .mfb + cmp.ne p11,p0 = nearbyint_GR_rcs0, r0 +(p6) fclass.m.unc p9,p10 = NEARBYINT_FLOAT_INT_f8, 0x07 // Test for result=0 +(p11) br.cond.spnt L(NEARBYINT_NOT_ROUND_NEAREST) // Branch if not round to nearest +;; +} + +{ .mfi + nop.m 999 +(p7) fnorm.d.s0 f8 = f8 + nop.i 999 +;; +} + +// If result is zero, merge sign of input +{ .mfi + nop.m 999 +(p9) fmerge.s f8 = f8, NEARBYINT_FLOAT_INT_f8 + nop.i 999 +} +{ .mfb + nop.m 999 +(p10) fnorm.d f8 = NEARBYINT_FLOAT_INT_f8 + br.ret.sptk b0 +;; +} + + +L(NEARBYINT_NOT_ROUND_NEAREST): +// Set rounding mode of s2 to that of s0 +{ .mfi + mov nearbyint_GR_rcs0 = r0 // Clear so we don't come back here + fsetc.s2 0x7f, 0x40 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fcvt.fx.s2 NEARBYINT_INT_f8 = f8 + nop.i 999 +;; +} + +{ .mfb + nop.m 999 + fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8 + br.cond.sptk L(NEARBYINT_COMMON) +;; +} + + +.endp nearbyint +ASM_SIZE_DIRECTIVE(nearbyint) diff --git a/sysdeps/ia64/fpu/s_nearbyintf.S b/sysdeps/ia64/fpu/s_nearbyintf.S new file mode 100644 index 0000000000..02806e34dc --- /dev/null +++ b/sysdeps/ia64/fpu/s_nearbyintf.S @@ -0,0 +1,221 @@ +.file "nearbyintf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 10/19/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the +// Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 10/19/2000: Created +// 2/08/01 Corrected behavior for all rounding modes. +//============================================================== +// +// API +//============================================================== +// float nearbyintf(float x) + +#include "libm_support.h" + +// +// general registers used: +// + +nearbyint_GR_signexp = r14 +nearbyint_GR_exponent = r15 +nearbyint_GR_17ones = r16 +nearbyint_GR_10033 = r17 +nearbyint_GR_fpsr = r18 +nearbyint_GR_rcs0 = r19 +nearbyint_GR_rcs0_mask = r20 + + +// predicate registers used: +// p6-11 + +// floating-point registers used: + +NEARBYINT_NORM_f8 = f9 +NEARBYINT_FLOAT_INT_f8 = f10 +NEARBYINT_INT_f8 = f11 + +// Overview of operation +//============================================================== + +// float nearbyintf(float x) +// Return an integer value (represented as a float) that is x rounded to integer in current +// rounding mode +// ******************************************************************************* + +// Set denormal flag for denormal input and +// take denormal fault if necessary. + +// Is the input an integer value already? 
+ +// double_extended +// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is >= 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. + +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +.align 32 +.global nearbyintf# + +.section .text +.proc nearbyintf# +.align 32 + + +nearbyintf: + +{ .mfi + mov nearbyint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 + fcvt.fx.s1 NEARBYINT_INT_f8 = f8 + addl nearbyint_GR_10033 = 0x10016, r0 // Exponent at which a float has no fraction (alias name says 10033) +} +{ .mfi + nop.m 999 + fnorm.s1 NEARBYINT_NORM_f8 = f8 + mov nearbyint_GR_17ones = 0x1FFFF +;; +} + +{ .mfi + nop.m 999 + fclass.m.unc p6,p0 = f8, 0xe7 + mov nearbyint_GR_rcs0_mask = 0x0c00 // Mask applied to the fpsr to extract rc.s0 +;; +} + +{ .mfb + nop.m 999 +(p6) fnorm.s f8 = f8 +(p6) br.ret.spnt b0 // Exit if x nan, inf, zero +;; +} + +{ .mfi + nop.m 999 + fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8 + nop.i 999 +;; +} + +{ .mfi + getf.exp nearbyint_GR_signexp = NEARBYINT_NORM_f8 + fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal + nop.i 999 +;; +} + + +{ .mii + nop.m 999 + nop.i 999 + and nearbyint_GR_exponent = nearbyint_GR_signexp, nearbyint_GR_17ones +;; +} + +{ .mmi + cmp.ge.unc p7,p6 = nearbyint_GR_exponent, nearbyint_GR_10033 // p7: exponent >= threshold, p6: below threshold + and nearbyint_GR_rcs0 = 
nearbyint_GR_rcs0_mask, nearbyint_GR_fpsr + nop.i 999 +;; +} + +// Check to see if s0 rounding mode is round to nearest. 
If not then set s2 +// rounding mode to that of s0 and repeat conversions. +L(NEARBYINT_COMMON): +{ .mfb + cmp.ne p11,p0 = nearbyint_GR_rcs0, r0 +(p6) fclass.m.unc p9,p10 = NEARBYINT_FLOAT_INT_f8, 0x07 // Test for result=0 +(p11) br.cond.spnt L(NEARBYINT_NOT_ROUND_NEAREST) // Branch if not round to nearest +;; +} + +{ .mfi + nop.m 999 +(p7) fnorm.s.s0 f8 = f8 // Exponent >= threshold: return normalized input + nop.i 999 +;; +} + +// If result is zero, merge sign of input +{ .mfi + nop.m 999 +(p9) fmerge.s f8 = f8, NEARBYINT_FLOAT_INT_f8 + nop.i 999 +} +{ .mfb + nop.m 999 +(p10) fnorm.s f8 = NEARBYINT_FLOAT_INT_f8 // Nonzero result: return the converted integer + br.ret.sptk b0 +;; +} + + +L(NEARBYINT_NOT_ROUND_NEAREST): +// Set rounding mode of s2 to that of s0 +{ .mfi + mov nearbyint_GR_rcs0 = r0 // Clear so we don't come back here + fsetc.s2 0x7f, 0x40 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fcvt.fx.s2 NEARBYINT_INT_f8 = f8 // Repeat the conversion under the s2 controls + nop.i 999 +;; +} + +{ .mfb + nop.m 999 + fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8 + br.cond.sptk L(NEARBYINT_COMMON) +;; +} + + +.endp nearbyintf +ASM_SIZE_DIRECTIVE(nearbyintf) diff --git a/sysdeps/ia64/fpu/s_nearbyintl.S b/sysdeps/ia64/fpu/s_nearbyintl.S new file mode 100644 index 0000000000..df935d500e --- /dev/null +++ b/sysdeps/ia64/fpu/s_nearbyintl.S @@ -0,0 +1,218 @@ +.file "nearbyintl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 10/19/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the +// Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 10/19/2000: Created +// 2/08/01 Corrected behavior for all rounding modes. +//============================================================== +// +// API +//============================================================== +// long double nearbyintl(long double x) + +#include "libm_support.h" + +// +// general registers used: +// + +nearbyint_GR_signexp = r14 +nearbyint_GR_exponent = r15 +nearbyint_GR_17ones = r16 +nearbyint_GR_10033 = r17 +nearbyint_GR_fpsr = r18 +nearbyint_GR_rcs0 = r19 +nearbyint_GR_rcs0_mask = r20 + + +// predicate registers used: +// p6-11 + +// floating-point registers used: + +NEARBYINT_NORM_f8 = f9 +NEARBYINT_FLOAT_INT_f8 = f10 +NEARBYINT_INT_f8 = f11 +NEARBYINT_SIGNED_FLOAT_INT_f8 = f12 + +// Overview of operation +//============================================================== + +// long double nearbyintl(long double x) +// Return an integer value (represented as a long double) that is +// x rounded to integer in current rounding mode +// ******************************************************************************* + +// Set denormal flag for denormal input and +// take denormal fault if necessary. + +// Is the input an integer value already? 
+ +// double_extended +// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is >= 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. + +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +.align 32 +.global nearbyintl# + +.section .text +.proc nearbyintl# +.align 32 + + +nearbyintl: + +{ .mfi + mov nearbyint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 + fcvt.fx.s1 NEARBYINT_INT_f8 = f8 + addl nearbyint_GR_10033 = 0x1003e, r0 // Exponent at which a long double has no fraction (alias name says 10033) +} +{ .mfi + nop.m 999 + fnorm.s1 NEARBYINT_NORM_f8 = f8 + mov nearbyint_GR_17ones = 0x1FFFF +;; +} + +{ .mfi + nop.m 999 + fclass.m.unc p6,p0 = f8, 0xe7 + mov nearbyint_GR_rcs0_mask = 0x0c00 // Mask applied to the fpsr to extract rc.s0 +;; +} + +{ .mfb + nop.m 999 +(p6) fnorm f8 = f8 +(p6) br.ret.spnt b0 // Exit if x nan, inf, zero +;; +} + +{ .mfi + nop.m 999 + fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8 + nop.i 999 +;; +} + +{ .mfi + getf.exp nearbyint_GR_signexp = NEARBYINT_NORM_f8 + fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal + nop.i 999 +;; +} + + +{ .mii + nop.m 999 + nop.i 999 + and nearbyint_GR_exponent = nearbyint_GR_signexp, nearbyint_GR_17ones +;; +} + +{ .mmi + cmp.ge.unc p7,p6 = nearbyint_GR_exponent, nearbyint_GR_10033 // p7: exponent >= threshold, p6: below threshold + and nearbyint_GR_rcs0 
= nearbyint_GR_rcs0_mask, nearbyint_GR_fpsr + nop.i 999 +;; +} + +// Check to see if s0 rounding mode is round to nearest. 
If not then set s2 +// rounding mode to that of s0 and repeat conversions. +// Must merge the original sign for cases where the result is zero or the input +// is the largest that still has a fraction (0x1007dfffffffffff) +L(NEARBYINT_COMMON): +{ .mfb + cmp.ne p11,p0 = nearbyint_GR_rcs0, r0 +(p6) fmerge.s NEARBYINT_SIGNED_FLOAT_INT_f8 = f8, NEARBYINT_FLOAT_INT_f8 +(p11) br.cond.spnt L(NEARBYINT_NOT_ROUND_NEAREST) // Branch if not round to nearest +;; +} + +{ .mfi + nop.m 999 +(p7) fnorm.s0 f8 = f8 // Exponent >= threshold: return normalized input + nop.i 999 +;; +} + +{ .mfb + nop.m 999 +(p6) fnorm f8 = NEARBYINT_SIGNED_FLOAT_INT_f8 // Below threshold: return the sign-merged converted integer + br.ret.sptk b0 +;; +} + + +L(NEARBYINT_NOT_ROUND_NEAREST): +// Set rounding mode of s2 to that of s0 +{ .mfi + mov nearbyint_GR_rcs0 = r0 // Clear so we don't come back here + fsetc.s2 0x7f, 0x40 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fcvt.fx.s2 NEARBYINT_INT_f8 = f8 // Repeat the conversion under the s2 controls + nop.i 999 +;; +} + +{ .mfb + nop.m 999 + fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8 + br.cond.sptk L(NEARBYINT_COMMON) +;; +} + + +.endp nearbyintl +ASM_SIZE_DIRECTIVE(nearbyintl) diff --git a/sysdeps/ia64/fpu/s_rint.S b/sysdeps/ia64/fpu/s_rint.S new file mode 100644 index 0000000000..fd99e8ebc8 --- /dev/null +++ b/sysdeps/ia64/fpu/s_rint.S @@ -0,0 +1,241 @@ +.file "rint.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 2/08/01 Corrected behavior for all rounding modes. +// +// API +//============================================================== +// double rint(double x) + +#include "libm_support.h" + +// +// general registers used: +// +rint_GR_FFFF = r14 +rint_GR_signexp = r15 +rint_GR_exponent = r16 +rint_GR_17ones = r17 +rint_GR_10033 = r18 +rint_GR_fpsr = r19 +rint_GR_rcs0 = r20 +rint_GR_rcs0_mask = r21 + + +// predicate registers used: +// p6-11 + +// floating-point registers used: + +RINT_NORM_f8 = f9 +RINT_FFFF = f10 +RINT_INEXACT = f11 +RINT_FLOAT_INT_f8 = f12 +RINT_INT_f8 = f13 + +// Overview of operation +//============================================================== + +// double rint(double x) +// Return an integer value (represented as a double) that is x rounded to integer in current +// rounding mode +// Inexact is set if x != rint(x) +// ******************************************************************************* + +// Set denormal flag for denormal input and +// take denormal fault if necessary. + +// Is the input an integer value already? 
+ +// double_extended +// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is >= 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. + +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +.align 32 +.global rint# + +.section .text +.proc rint# +.align 32 + + +rint: +#ifdef _LIBC +.global __rint +.type __rint,@function +__rint: +#endif + +{ .mfi + mov rint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 + fcvt.fx.s1 RINT_INT_f8 = f8 + addl rint_GR_10033 = 0x10033, r0 // Exponent at which a double has no fraction +} +{ .mfi + mov rint_GR_FFFF = -1 + fnorm.s1 RINT_NORM_f8 = f8 + mov rint_GR_17ones = 0x1FFFF +;; +} + +{ .mfi + setf.sig RINT_FFFF = rint_GR_FFFF + fclass.m.unc p6,p0 = f8, 0xe7 + mov rint_GR_rcs0_mask = 0x0c00 // Mask applied to the fpsr to extract rc.s0 +;; +} + +{ .mfb + nop.m 999 +(p6) fnorm.d f8 = f8 +(p6) br.ret.spnt b0 // Exit if x nan, inf, zero +;; +} + +{ .mfi + nop.m 999 + fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 + nop.i 999 +;; +} + +{ .mfi + getf.exp rint_GR_signexp = RINT_NORM_f8 + fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal + nop.i 999 +;; +} + + +{ .mii + nop.m 999 + nop.i 999 + and rint_GR_exponent = rint_GR_signexp, rint_GR_17ones +;; +} + +{ .mmi + cmp.ge.unc p7,p6 = rint_GR_exponent, rint_GR_10033 // p7: exponent >= threshold, p6: below threshold + and rint_GR_rcs0 = 
rint_GR_rcs0_mask, rint_GR_fpsr + nop.i 999 +;; +} + +// Check to see if s0 rounding mode is round to nearest. 
If not then set s2 +// rounding mode to that of s0 and repeat conversions. +L(RINT_COMMON): +{ .mfb + cmp.ne p11,p0 = rint_GR_rcs0, r0 +(p6) fclass.m.unc p9,p10 = RINT_FLOAT_INT_f8, 0x07 // Test for result=0 +(p11) br.cond.spnt L(RINT_NOT_ROUND_NEAREST) // Branch if not round to nearest +;; +} + +{ .mfi + nop.m 999 +(p6) fcmp.eq.unc.s1 p0,p8 = RINT_FLOAT_INT_f8, RINT_NORM_f8 // p8 set when the result differs from the input + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fnorm.d.s0 f8 = f8 // Exponent >= threshold: return normalized input + nop.i 999 +;; +} + +// If result is zero, merge sign of input +{ .mfi + nop.m 999 +(p9) fmerge.s f8 = f8, RINT_FLOAT_INT_f8 + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fnorm.d f8 = RINT_FLOAT_INT_f8 // Nonzero result: return the converted integer + nop.i 999 +;; +} + +{ .mfb + nop.m 999 +(p8) fmpy.s0 RINT_INEXACT = RINT_FFFF,RINT_FFFF // Dummy to set inexact + br.ret.sptk b0 +;; +} + +L(RINT_NOT_ROUND_NEAREST): +// Set rounding mode of s2 to that of s0 +{ .mfi + mov rint_GR_rcs0 = r0 // Clear so we don't come back here + fsetc.s2 0x7f, 0x40 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fcvt.fx.s2 RINT_INT_f8 = f8 // Repeat the conversion under the s2 controls + nop.i 999 +;; +} + +{ .mfb + nop.m 999 + fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 + br.cond.sptk L(RINT_COMMON) +;; +} + + +.endp rint +ASM_SIZE_DIRECTIVE(rint) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__rint) +#endif diff --git a/sysdeps/ia64/fpu/s_rintf.S b/sysdeps/ia64/fpu/s_rintf.S new file mode 100644 index 0000000000..78742dc81f --- /dev/null +++ b/sysdeps/ia64/fpu/s_rintf.S @@ -0,0 +1,241 @@ +.file "rintf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 2/08/01 Corrected behavior for all rounding modes. +// +// API +//============================================================== +// float rintf(float x) + +#include "libm_support.h" + +// +// general registers used: +// +rint_GR_FFFF = r14 +rint_GR_signexp = r15 +rint_GR_exponent = r16 +rint_GR_17ones = r17 +rint_GR_10033 = r18 +rint_GR_fpsr = r19 +rint_GR_rcs0 = r20 +rint_GR_rcs0_mask = r21 + + +// predicate registers used: +// p6-11 + +// floating-point registers used: + +RINT_NORM_f8 = f9 +RINT_FFFF = f10 +RINT_INEXACT = f11 +RINT_FLOAT_INT_f8 = f12 +RINT_INT_f8 = f13 + +// Overview of operation +//============================================================== + +// float rintf(float x) +// Return an integer value (represented as a float) that is x rounded to integer in current +// rounding mode +// Inexact is set if x != rintf(x) +// ******************************************************************************* + +// Set denormal flag for denormal input and +// take denormal fault if necessary. + +// Is the input an integer value already? 
+ +// double_extended +// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is >= 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. + +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +.align 32 +.global rintf# + +.section .text +.proc rintf# +.align 32 + + +rintf: +#ifdef _LIBC +.global __rintf +.type __rintf,@function +__rintf: +#endif + +{ .mfi + mov rint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 + fcvt.fx.s1 RINT_INT_f8 = f8 + addl rint_GR_10033 = 0x10016, r0 // Exponent at which a float has no fraction (alias name says 10033) +} +{ .mfi + mov rint_GR_FFFF = -1 + fnorm.s1 RINT_NORM_f8 = f8 + mov rint_GR_17ones = 0x1FFFF +;; +} + +{ .mfi + setf.sig RINT_FFFF = rint_GR_FFFF + fclass.m.unc p6,p0 = f8, 0xe7 + mov rint_GR_rcs0_mask = 0x0c00 // Mask applied to the fpsr to extract rc.s0 +;; +} + +{ .mfb + nop.m 999 +(p6) fnorm.s f8 = f8 +(p6) br.ret.spnt b0 // Exit if x nan, inf, zero +;; +} + +{ .mfi + nop.m 999 + fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 + nop.i 999 +;; +} + +{ .mfi + getf.exp rint_GR_signexp = RINT_NORM_f8 + fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal + nop.i 999 +;; +} + + +{ .mii + nop.m 999 + nop.i 999 + and rint_GR_exponent = rint_GR_signexp, rint_GR_17ones +;; +} + +{ .mmi + cmp.ge.unc p7,p6 = rint_GR_exponent, rint_GR_10033 // p7: exponent >= threshold, p6: below threshold + and 
rint_GR_rcs0 = rint_GR_rcs0_mask, rint_GR_fpsr + nop.i 999 +;; +} + +// Check to see if s0 rounding mode is round to nearest. 
If not then set s2 +// rounding mode to that of s0 and repeat conversions. +L(RINT_COMMON): +{ .mfb + cmp.ne p11,p0 = rint_GR_rcs0, r0 +(p6) fclass.m.unc p9,p10 = RINT_FLOAT_INT_f8, 0x07 // Test for result=0 +(p11) br.cond.spnt L(RINT_NOT_ROUND_NEAREST) // Branch if not round to nearest +;; +} + +{ .mfi + nop.m 999 +(p6) fcmp.eq.unc.s1 p0,p8 = RINT_FLOAT_INT_f8, RINT_NORM_f8 // p8 set when the result differs from the input + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fnorm.s.s0 f8 = f8 // Exponent >= threshold: return normalized input + nop.i 999 +;; +} + +// If result is zero, merge sign of input +{ .mfi + nop.m 999 +(p9) fmerge.s f8 = f8, RINT_FLOAT_INT_f8 + nop.i 999 +} +{ .mfi + nop.m 999 +(p10) fnorm.s f8 = RINT_FLOAT_INT_f8 // Nonzero result: return the converted integer + nop.i 999 +;; +} + +{ .mfb + nop.m 999 +(p8) fmpy.s0 RINT_INEXACT = RINT_FFFF,RINT_FFFF // Dummy to set inexact + br.ret.sptk b0 +;; +} + +L(RINT_NOT_ROUND_NEAREST): +// Set rounding mode of s2 to that of s0 +{ .mfi + mov rint_GR_rcs0 = r0 // Clear so we don't come back here + fsetc.s2 0x7f, 0x40 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fcvt.fx.s2 RINT_INT_f8 = f8 // Repeat the conversion under the s2 controls + nop.i 999 +;; +} + +{ .mfb + nop.m 999 + fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 + br.cond.sptk L(RINT_COMMON) +;; +} + + +.endp rintf +ASM_SIZE_DIRECTIVE(rintf) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__rintf) +#endif diff --git a/sysdeps/ia64/fpu/s_rintl.S b/sysdeps/ia64/fpu/s_rintl.S new file mode 100644 index 0000000000..9bf7492d88 --- /dev/null +++ b/sysdeps/ia64/fpu/s_rintl.S @@ -0,0 +1,239 @@ +.file "rintl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 5/24/00 Fixed case of 2^63 - 1 + 0.5 (0x1007dffffffffffffffff) +// 2/08/01 Corrected behavior for all rounding modes. +// +// API +//============================================================== +// long double rintl(long double x) + +#include "libm_support.h" + +// +// general registers used: +// +rint_GR_FFFF = r14 +rint_GR_signexp = r15 +rint_GR_exponent = r16 +rint_GR_17ones = r17 +rint_GR_10033 = r18 +rint_GR_fpsr = r19 +rint_GR_rcs0 = r20 +rint_GR_rcs0_mask = r21 + + +// predicate registers used: +// p6-11 + +// floating-point registers used: + +RINT_NORM_f8 = f9 +RINT_FFFF = f10 +RINT_INEXACT = f11 +RINT_FLOAT_INT_f8 = f12 +RINT_INT_f8 = f13 +RINT_SIGNED_FLOAT_INT_f8 = f14 + +// Overview of operation +//============================================================== + +// long double rintl(long double x) +// Return an integer value (represented as a long double) that is x rounded to integer in current +// rounding mode +// Inexact is set if x != rintl(x) +// ******************************************************************************* + +// Set denormal flag for denormal input and +// take denormal fault if necessary. 
+ +// Is the input an integer value already? + +// double_extended +// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. + +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. + +// single +// if the exponent is >= 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. + +// If x is NAN, ZERO, or INFINITY, then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +.align 32 +.global rintl# + +.section .text +.proc rintl# +.align 32 + + +rintl: +#ifdef _LIBC +.global __rintl +.type __rintl,@function +__rintl: +#endif + +{ .mfi + mov rint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 + fcvt.fx.s1 RINT_INT_f8 = f8 + addl rint_GR_10033 = 0x1003e, r0 // Exponent at which a long double has no fraction (alias name says 10033) +} +{ .mfi + mov rint_GR_FFFF = -1 + fnorm.s1 RINT_NORM_f8 = f8 + mov rint_GR_17ones = 0x1FFFF +;; +} + +{ .mfi + setf.sig RINT_FFFF = rint_GR_FFFF + fclass.m.unc p6,p0 = f8, 0xe7 + mov rint_GR_rcs0_mask = 0x0c00 // Mask applied to the fpsr to extract rc.s0 +;; +} + +{ .mfb + nop.m 999 +(p6) fnorm f8 = f8 +(p6) br.ret.spnt b0 // Exit if x nan, inf, zero +;; +} + +{ .mfi + nop.m 999 + fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 + nop.i 999 +;; +} + +{ .mfi + getf.exp rint_GR_signexp = RINT_NORM_f8 + fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal + nop.i 999 +;; +} + + +{ .mii + nop.m 999 + nop.i 999 + and rint_GR_exponent = rint_GR_signexp, rint_GR_17ones +;; +} + +{ .mmi + cmp.ge.unc p7,p6 = rint_GR_exponent, rint_GR_10033 // p7: exponent 
>= threshold, p6: below threshold + and rint_GR_rcs0 = rint_GR_rcs0_mask, rint_GR_fpsr + nop.i 999 +;; +} + +// Check to see if s0 rounding mode is round to 
nearest. If not then set s2 +// rounding mode to that of s0 and repeat conversions. +// Must merge the original sign for cases where the result is zero or the input +// is the largest that still has a fraction (0x1007dfffffffffff) +L(RINT_COMMON): +{ .mfb + cmp.ne p11,p0 = rint_GR_rcs0, r0 +(p6) fmerge.s RINT_SIGNED_FLOAT_INT_f8 = f8, RINT_FLOAT_INT_f8 +(p11) br.cond.spnt L(RINT_NOT_ROUND_NEAREST) // Branch if not round to nearest +;; +} + +{ .mfi + nop.m 999 +(p6) fcmp.eq.unc.s1 p0,p8 = RINT_FLOAT_INT_f8, RINT_NORM_f8 // p8 set when the result differs from the input + nop.i 999 +} +{ .mfi + nop.m 999 +(p7) fnorm.s0 f8 = f8 // Exponent >= threshold: return normalized input + nop.i 999 +;; +} + +{ .mfi + nop.m 999 +(p6) fnorm f8 = RINT_SIGNED_FLOAT_INT_f8 // Below threshold: return the sign-merged converted integer + nop.i 999 +;; +} + +{ .mfb + nop.m 999 +(p8) fmpy.s0 RINT_INEXACT = RINT_FFFF,RINT_FFFF // Dummy to set inexact + br.ret.sptk b0 +;; +} + +L(RINT_NOT_ROUND_NEAREST): +// Set rounding mode of s2 to that of s0 +{ .mfi + mov rint_GR_rcs0 = r0 // Clear so we don't come back here + fsetc.s2 0x7f, 0x40 + nop.i 999 +;; +} + +{ .mfi + nop.m 999 + fcvt.fx.s2 RINT_INT_f8 = f8 // Repeat the conversion under the s2 controls + nop.i 999 +;; +} + +{ .mfb + nop.m 999 + fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 + br.cond.sptk L(RINT_COMMON) +;; +} + + +.endp rintl +ASM_SIZE_DIRECTIVE(rintl) +#ifdef _LIBC +ASM_SIZE_DIRECTIVE(__rintl) +#endif diff --git a/sysdeps/ia64/fpu/s_round.S b/sysdeps/ia64/fpu/s_round.S new file mode 100644 index 0000000000..30e8af8c02 --- /dev/null +++ b/sysdeps/ia64/fpu/s_round.S @@ -0,0 +1,236 @@ +.file "round.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 10/25/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the +// Computational Software Lab, Intel Corporation. 
+// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 10/25/2000: Created +//============================================================== +// +// API +//============================================================== +// double round(double x) +// + +#include "libm_support.h" + +// general input registers: +// +round_GR_half = r14 +round_GR_big = r15 +round_GR_expmask = r16 +round_GR_signexp = r17 +round_GR_exp = r18 +round_GR_expdiff = r19 + +// predicate registers used: +// p6 - p10 + +// floating-point registers used: + +ROUND_NORM_f8 = f9 +ROUND_TRUNC_f8 = f10 +ROUND_RINT_f8 = f11 +ROUND_FLOAT_TRUNC_f8 = f12 +ROUND_FLOAT_RINT_f8 = f13 +ROUND_REMAINDER = f14 +ROUND_HALF = f15 + +// Overview of operation +//============================================================== + +// double round(double x) +// Return an integer value (represented as a double) that is x +// rounded to nearest integer, halfway cases rounded away from +// zero. 
+// if x>0 result = trunc(x+0.5) +// if x<0 result = trunc(x-0.5) +// ******************************************************************************* + +// Set denormal flag for denormal input and +// take denormal fault if necessary. + +// If x is NAN, ZERO, INFINITY, or >= 2^52 then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +.align 32 +.global round# + +.section .text +.proc round# +.align 32 + + +round: + +// Get exponent for +0.5 +// Truncate x to integer +{ .mfi + addl round_GR_half = 0x0fffe, r0 + fcvt.fx.trunc.s1 ROUND_TRUNC_f8 = f8 + nop.i 999 +} + +// Get signexp of x +// Normalize input +// Form exponent mask +{ .mfi + getf.exp round_GR_signexp = f8 + fnorm ROUND_NORM_f8 = f8 + addl round_GR_expmask = 0x1ffff, r0 ;; +} + +// Form +0.5 +// Round x to integer +{ .mfi + setf.exp ROUND_HALF = round_GR_half + fcvt.fx.s1 ROUND_RINT_f8 = f8 + nop.i 999 ;; +} +// Get exp of x +// Test for NAN, INF, ZERO +// Get exponent at which input has no fractional part +{ .mfi + and round_GR_exp = round_GR_expmask, round_GR_signexp + fclass.m p8,p9 = f8,0xe7 + addl round_GR_big = 0x10033, r0 ;; +} + +// Get exp-bigexp +// If exp is so big there is no fractional part, then turn on p8, off p9 +{ .mmi + sub round_GR_expdiff = round_GR_exp, round_GR_big ;; +#ifdef _LIBC +(p9) cmp.lt.or.andcm p8,p9 = r0, round_GR_expdiff // NOTE(review): tests 0 < expdiff; the #else form tests expdiff >= 0 -- confirm the expdiff == 0 difference is intended +#else +(p9) cmp.ge.or.andcm p8,p9 = round_GR_expdiff, r0 +#endif + nop.i 999 ;; +} + +// Set p6 if x<0, else set p7 +{ .mfi + nop.m 999 +(p9) fcmp.lt.unc p6,p7 = f8,f0 + nop.i 999 +} + +// If NAN, INF, ZERO, or no fractional part, result is just normalized input +{ .mfi + nop.m 999 +(p8) fnorm.d.s0 f8 = f8 + nop.i 999 ;; +} + +// Float the truncated integer +{ .mfi + nop.m 999 +(p9) fcvt.xf ROUND_FLOAT_TRUNC_f8 = ROUND_TRUNC_f8 + nop.i 999 ;; +} + +// Float the rounded integer to get preliminary result +{ .mfi + nop.m 999 +(p9) 
fcvt.xf ROUND_FLOAT_RINT_f8 = ROUND_RINT_f8 + nop.i 999 ;; +} + +// If x<0 and the difference of the 
truncated input minus the input is 0.5 +// then result = truncated input - 1.0 +// Else if x>0 and the difference of the input minus truncated input is 0.5 +// then result = truncated input + 1.0 +// Else +// result = rounded input +// Endif +{ .mfi + nop.m 999 +(p6) fsub.s1 ROUND_REMAINDER = ROUND_FLOAT_TRUNC_f8, ROUND_NORM_f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fsub.s1 ROUND_REMAINDER = ROUND_NORM_f8, ROUND_FLOAT_TRUNC_f8 + nop.i 999 ;; +} + +// Assume preliminary result is rounded integer +{ .mfi + nop.m 999 +(p9) fnorm.d.s0 f8 = ROUND_FLOAT_RINT_f8 + nop.i 999 +} + +// If x<0, test if result=0 +{ .mfi + nop.m 999 +(p6) fcmp.eq.unc p10,p0 = ROUND_FLOAT_RINT_f8,f0 + nop.i 999 ;; +} + +// If x<0 and result=0, set result=-0 +{ .mfi + nop.m 999 +(p10) fmerge.ns f8 = f1,f8 + nop.i 999 +} + +// If x<0, test if remainder=0.5 +{ .mfi + nop.m 999 +(p6) fcmp.eq.unc p6,p0 = ROUND_REMAINDER, ROUND_HALF + nop.i 999 ;; +} + +// If x>0, test if remainder=0.5 +{ .mfi + nop.m 999 +(p7) fcmp.eq.unc p7,p0 = ROUND_REMAINDER, ROUND_HALF + nop.i 999 ;; +} + +// If x<0 and remainder=0.5, result=truncated-1.0 +// If x>0 and remainder=0.5, result=truncated+1.0 +// Exit +.pred.rel "mutex",p6,p7 +{ .mfi + nop.m 999 +(p6) fsub.d.s0 f8 = ROUND_FLOAT_TRUNC_f8,f1 + nop.i 999 +} + +{ .mfb + nop.m 999 +(p7) fadd.d.s0 f8 = ROUND_FLOAT_TRUNC_f8,f1 + br.ret.sptk b0 ;; +} + +.endp round +ASM_SIZE_DIRECTIVE(round) diff --git a/sysdeps/ia64/fpu/s_roundf.S b/sysdeps/ia64/fpu/s_roundf.S new file mode 100644 index 0000000000..9aa0d6c76f --- /dev/null +++ b/sysdeps/ia64/fpu/s_roundf.S @@ -0,0 +1,236 @@ +.file "roundf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 10/25/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the +// Computational Software Lab, Intel Corporation. 
+// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// History +//============================================================== +// 10/25/2000: Created +//============================================================== +// +// API +//============================================================== +// float roundf(float x) +// + +#include "libm_support.h" + +// general input registers: +// +roundf_GR_half = r14 +roundf_GR_big = r15 +roundf_GR_expmask = r16 +roundf_GR_signexp = r17 +roundf_GR_exp = r18 +roundf_GR_expdiff = r19 + +// predicate registers used: +// p6 - p10 + +// floating-point registers used: + +ROUNDF_NORM_f8 = f9 +ROUNDF_TRUNC_f8 = f10 +ROUNDF_RINT_f8 = f11 +ROUNDF_FLOAT_TRUNC_f8 = f12 +ROUNDF_FLOAT_RINT_f8 = f13 +ROUNDF_REMAINDER = f14 +ROUNDF_HALF = f15 + +// Overview of operation +//============================================================== + +// float roundf(float x) +// Return an integer value (represented as a float) that is x +// rounded to nearest integer, halfway cases rounded away from +// zero. +// if x>0 result = trunc(x+0.5) +// if x<0 result = trunc(x-0.5) +// ******************************************************************************* + +// Set denormal flag for denormal input and +// take denormal fault if necessary. 
+ +// If x is NAN, ZERO, INFINITY, or >= 2^23 then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +.align 32 +.global roundf# + +.section .text +.proc roundf# +.align 32 + + +roundf: + +// Get exponent for +0.5 +// Truncate x to integer +{ .mfi + addl roundf_GR_half = 0x0fffe, r0 + fcvt.fx.trunc.s1 ROUNDF_TRUNC_f8 = f8 + nop.i 999 +} + +// Get signexp of x +// Normalize input +// Form exponent mask +{ .mfi + getf.exp roundf_GR_signexp = f8 + fnorm ROUNDF_NORM_f8 = f8 + addl roundf_GR_expmask = 0x1ffff, r0 ;; +} + +// Form +0.5 +// Round x to integer +{ .mfi + setf.exp ROUNDF_HALF = roundf_GR_half + fcvt.fx.s1 ROUNDF_RINT_f8 = f8 + nop.i 999 ;; +} +// Get exp of x +// Test for NAN, INF, ZERO +// Get exponent at which input has no fractional part +{ .mfi + and roundf_GR_exp = roundf_GR_expmask, roundf_GR_signexp + fclass.m p8,p9 = f8,0xe7 + addl roundf_GR_big = 0x10016, r0 ;; +} + +// Get exp-bigexp +// If exp is so big there is no fractional part, then turn on p8, off p9 +// Both preprocessor branches below test expdiff >= 0 (exp >= bigexp), +// matching the ">= 2^23 then return" rule stated above. +{ .mmi + sub roundf_GR_expdiff = roundf_GR_exp, roundf_GR_big ;; +#ifdef _LIBC +(p9) cmp.le.or.andcm p8,p9 = r0, roundf_GR_expdiff +#else +(p9) cmp.ge.or.andcm p8,p9 = roundf_GR_expdiff, r0 +#endif + nop.i 999 ;; +} + +// Set p6 if x<0, else set p7 +{ .mfi + nop.m 999 +(p9) fcmp.lt.unc p6,p7 = f8,f0 + nop.i 999 +} + +// If NAN, INF, ZERO, or no fractional part, result is just normalized input +{ .mfi + nop.m 999 +(p8) fnorm.s.s0 f8 = f8 + nop.i 999 ;; +} + +// Float the truncated integer +{ .mfi + nop.m 999 +(p9) fcvt.xf ROUNDF_FLOAT_TRUNC_f8 = ROUNDF_TRUNC_f8 + nop.i 999 ;; +} + +// Float the rounded integer to get preliminary result +{ .mfi + nop.m 999 +(p9) fcvt.xf ROUNDF_FLOAT_RINT_f8 = ROUNDF_RINT_f8 + nop.i 999 ;; +} + +// If x<0 and the difference of the truncated input minus the input is 0.5 +// then result = truncated input - 1.0 +// Else if x>0 and the difference of the input minus truncated input is 0.5 +// then result = truncated input + 1.0 +// Else +// result = 
rounded input +// Endif +{ .mfi + nop.m 999 +(p6) fsub.s1 ROUNDF_REMAINDER = ROUNDF_FLOAT_TRUNC_f8, ROUNDF_NORM_f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fsub.s1 ROUNDF_REMAINDER = ROUNDF_NORM_f8, ROUNDF_FLOAT_TRUNC_f8 + nop.i 999 ;; +} + +// Assume preliminary result is rounded integer +{ .mfi + nop.m 999 +(p9) fnorm.s.s0 f8 = ROUNDF_FLOAT_RINT_f8 + nop.i 999 +} + +// If x<0, test if result=0 +{ .mfi + nop.m 999 +(p6) fcmp.eq.unc p10,p0 = ROUNDF_FLOAT_RINT_f8,f0 + nop.i 999 ;; +} + +// If x<0 and result=0, set result=-0 +{ .mfi + nop.m 999 +(p10) fmerge.ns f8 = f1,f8 + nop.i 999 +} + +// If x<0, test if remainder=0.5 +{ .mfi + nop.m 999 +(p6) fcmp.eq.unc p6,p0 = ROUNDF_REMAINDER, ROUNDF_HALF + nop.i 999 ;; +} + +// If x>0, test if remainder=0.5 +{ .mfi + nop.m 999 +(p7) fcmp.eq.unc p7,p0 = ROUNDF_REMAINDER, ROUNDF_HALF + nop.i 999 ;; +} + +// If x<0 and remainder=0.5, result=truncated-1.0 +// If x>0 and remainder=0.5, result=truncated+1.0 +// Exit +.pred.rel "mutex",p6,p7 +{ .mfi + nop.m 999 +(p6) fsub.s.s0 f8 = ROUNDF_FLOAT_TRUNC_f8,f1 + nop.i 999 +} + +{ .mfb + nop.m 999 +(p7) fadd.s.s0 f8 = ROUNDF_FLOAT_TRUNC_f8,f1 + br.ret.sptk b0 ;; +} + +.endp roundf +ASM_SIZE_DIRECTIVE(roundf) diff --git a/sysdeps/ia64/fpu/s_roundl.S b/sysdeps/ia64/fpu/s_roundl.S new file mode 100644 index 0000000000..f581d2f65a --- /dev/null +++ b/sysdeps/ia64/fpu/s_roundl.S @@ -0,0 +1,236 @@ +.file "roundl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 10/25/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the +// Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 10/25/2000: Created +//============================================================== +// +// API +//============================================================== +// long double roundl(long double x) +// + +#include "libm_support.h" + +// general input registers: +// +roundl_GR_half = r14 +roundl_GR_big = r15 +roundl_GR_expmask = r16 +roundl_GR_signexp = r17 +roundl_GR_exp = r18 +roundl_GR_expdiff = r19 + +// predicate registers used: +// p6 - p10 + +// floating-point registers used: + +ROUNDL_NORM_f8 = f9 +ROUNDL_TRUNC_f8 = f10 +ROUNDL_RINT_f8 = f11 +ROUNDL_FLOAT_TRUNC_f8 = f12 +ROUNDL_FLOAT_RINT_f8 = f13 +ROUNDL_REMAINDER = f14 +ROUNDL_HALF = f15 + +// Overview of operation +//============================================================== + +// long double roundl(long double x) +// Return an integer value (represented as a long double) that is x +// rounded to nearest integer, halfway cases rounded away from +// zero. +// if x>0 result = trunc(x+0.5) +// if x<0 result = trunc(x-0.5) +// ******************************************************************************* + +// Set denormal flag for denormal input and +// take denormal fault if necessary. 
+ +// If x is NAN, ZERO, INFINITY, or >= 2^63 then return + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 0xe7 + + +.align 32 +.global roundl# + +.section .text +.proc roundl# +.align 32 + + +roundl: + +// Get exponent for +0.5 +// Truncate x to integer +{ .mfi + addl roundl_GR_half = 0x0fffe, r0 + fcvt.fx.trunc.s1 ROUNDL_TRUNC_f8 = f8 + nop.i 999 +} + +// Get signexp of x +// Normalize input +// Form exponent mask +{ .mfi + getf.exp roundl_GR_signexp = f8 + fnorm ROUNDL_NORM_f8 = f8 + addl roundl_GR_expmask = 0x1ffff, r0 ;; +} + +// Form +0.5 +// Round x to integer +{ .mfi + setf.exp ROUNDL_HALF = roundl_GR_half + fcvt.fx.s1 ROUNDL_RINT_f8 = f8 + nop.i 999 ;; +} +// Get exp of x +// Test for NAN, INF, ZERO +// Get exponent at which input has no fractional part +{ .mfi + and roundl_GR_exp = roundl_GR_expmask, roundl_GR_signexp + fclass.m p8,p9 = f8,0xe7 + addl roundl_GR_big = 0x1003e, r0 ;; +} + +// Get exp-bigexp +// If exp is so big there is no fractional part, then turn on p8, off p9 +// Both preprocessor branches below must test expdiff >= 0 (exp >= bigexp): +// x >= 2^63 does not fit the signed 64-bit results of the fcvt.fx +// conversions above, so such inputs must take the early-return path. 
+{ .mmi + sub roundl_GR_expdiff = roundl_GR_exp, roundl_GR_big ;; +#ifdef _LIBC +(p9) cmp.le.or.andcm p8,p9 = r0, roundl_GR_expdiff +#else +(p9) cmp.ge.or.andcm p8,p9 = roundl_GR_expdiff, r0 +#endif + nop.i 999 ;; +} + +// Set p6 if x<0, else set p7 +{ .mfi + nop.m 999 +(p9) fcmp.lt.unc p6,p7 = f8,f0 + nop.i 999 +} + +// If NAN, INF, ZERO, or no fractional part, result is just normalized input +{ .mfi + nop.m 999 +(p8) fnorm.s0 f8 = f8 + nop.i 999 ;; +} + +// Float the truncated integer +{ .mfi + nop.m 999 +(p9) fcvt.xf ROUNDL_FLOAT_TRUNC_f8 = ROUNDL_TRUNC_f8 + nop.i 999 ;; +} + +// Float the rounded integer to get preliminary result +{ .mfi + nop.m 999 +(p9) fcvt.xf ROUNDL_FLOAT_RINT_f8 = ROUNDL_RINT_f8 + nop.i 999 ;; +} + +// If x<0 and the difference of the truncated input minus the input is 0.5 +// then result = truncated input - 1.0 +// Else if x>0 and the difference of the input minus truncated input is 0.5 +// then result = truncated input + 1.0 +// Else +// result = 
rounded input +// Endif +{ .mfi + nop.m 999 +(p6) fsub.s1 ROUNDL_REMAINDER = ROUNDL_FLOAT_TRUNC_f8, ROUNDL_NORM_f8 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p7) fsub.s1 ROUNDL_REMAINDER = ROUNDL_NORM_f8, ROUNDL_FLOAT_TRUNC_f8 + nop.i 999 ;; +} + +// Assume preliminary result is rounded integer +{ .mfi + nop.m 999 +(p9) fnorm.s0 f8 = ROUNDL_FLOAT_RINT_f8 + nop.i 999 +} + +// If x<0, test if result=0 +{ .mfi + nop.m 999 +(p6) fcmp.eq.unc p10,p0 = ROUNDL_FLOAT_RINT_f8,f0 + nop.i 999 ;; +} + +// If x<0 and result=0, set result=-0 +{ .mfi + nop.m 999 +(p10) fmerge.ns f8 = f1,f8 + nop.i 999 +} + +// If x<0, test if remainder=0.5 +{ .mfi + nop.m 999 +(p6) fcmp.eq.unc p6,p0 = ROUNDL_REMAINDER, ROUNDL_HALF + nop.i 999 ;; +} + +// If x>0, test if remainder=0.5 +{ .mfi + nop.m 999 +(p7) fcmp.eq.unc p7,p0 = ROUNDL_REMAINDER, ROUNDL_HALF + nop.i 999 ;; +} + +// If x<0 and remainder=0.5, result=truncated-1.0 +// If x>0 and remainder=0.5, result=truncated+1.0 +// Exit +.pred.rel "mutex",p6,p7 +{ .mfi + nop.m 999 +(p6) fsub.s0 f8 = ROUNDL_FLOAT_TRUNC_f8,f1 + nop.i 999 +} + +{ .mfb + nop.m 999 +(p7) fadd.s0 f8 = ROUNDL_FLOAT_TRUNC_f8,f1 + br.ret.sptk b0 ;; +} + +.endp roundl +ASM_SIZE_DIRECTIVE(roundl) diff --git a/sysdeps/ia64/fpu/s_scalbn.S b/sysdeps/ia64/fpu/s_scalbn.S new file mode 100644 index 0000000000..caedffd795 --- /dev/null +++ b/sysdeps/ia64/fpu/s_scalbn.S @@ -0,0 +1,366 @@ +.file "scalbn.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 1/26/01 Scalbn completely reworked and now standalone version +// +// API +//============================================================== +// double = scalbn (double x, int n) +// input floating point f8 and int n (r33) +// output floating point f8 +// +// Returns x* 2**n using an fma and detects overflow +// and underflow. +// +// + +#include "libm_support.h" + +FR_Big = f6 +FR_NBig = f7 +FR_Floating_X = f8 +FR_Result = f8 +FR_Result2 = f9 +FR_Result3 = f11 +FR_Norm_X = f12 +FR_Two_N = f14 +FR_Two_to_Big = f15 + +GR_N_Biased = r15 +GR_Big = r16 +GR_NBig = r17 +GR_Scratch = r18 +GR_Scratch1 = r19 +GR_Bias = r20 +GR_N_as_int = r21 + +GR_SAVE_B0 = r32 +GR_SAVE_GP = r33 +GR_SAVE_PFS = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Tag = r38 + +.align 32 +.global scalbn + +.section .text +.proc scalbn +.align 32 + +scalbn: + +// +// Is x NAN, INF, ZERO, +-? +// Build the exponent Bias +// +{ .mfi + alloc r32=ar.pfs,1,2,4,0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Bias = 0x0FFFF,r0 +} + +// +// Sign extend input +// Is N zero? 
+// Normalize x +// +{ .mfi + cmp.eq.unc p6,p0 = r33,r0 + fnorm.s1 FR_Norm_X = FR_Floating_X + sxt4 GR_N_as_int = r33 +} +;; + +// +// Normalize x +// Branch and return special values. +// Create -35000 +// Create 35000 +// +{ .mfi + addl GR_Big = 35000,r0 + nop.f 0 + add GR_N_Biased = GR_Bias,GR_N_as_int +} +{ .mfb + addl GR_NBig = -35000,r0 +(p7) fma.d.s0 FR_Result = FR_Floating_X,f1, f0 +(p7) br.ret.spnt b0 +};; + +// +// Build the exponent Bias +// Return x when N = 0 +// +{ .mfi + setf.exp FR_Two_N = GR_N_Biased + nop.f 0 + addl GR_Scratch1 = 0x063BF,r0 +} +{ .mfb + addl GR_Scratch = 0x019C3F,r0 +(p6) fma.d.s0 FR_Result = FR_Floating_X,f1, f0 +(p6) br.ret.spnt b0 +};; + +// +// Create 2**big +// Create 2**-big +// Is N >= 35000 +// Is N <= -35000 +// Raise Denormal operand flag with compare +// Main path, create 2**N +// +{ .mfi + setf.exp FR_NBig = GR_Scratch1 + nop.f 0 + cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big +} +{ .mfi + setf.exp FR_Big = GR_Scratch + fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 + cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig +};; + +// +// Adjust 2**N if N was very small or very large +// +{ .mfi + nop.m 0 +(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch = 0x00000000000303FF +};; + + +{ .mfi + nop.m 0 +(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch1= 0x00000000000103FF +};; + +// Set up necessary status fields +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x41 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999 +};; + +// +// Do final operation +// FR_NBig and FR_Big are reloaded here with -2**1024 and +2**1024 +// (sign/exponent words 0x303FF and 0x103FF), the limits tested below. 
+// +{ .mfi + setf.exp FR_NBig = GR_Scratch + fma.d.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.d.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; +{ .mfi + setf.exp FR_Big = GR_Scratch1 + fma.d.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 + 
nop.i 999 +};; + +// Check for overflow or underflow. +// Restore s3 +// Restore s2 +// +{ .mfi + nop.m 0 + fsetc.s3 0x7F,0x40 + nop.i 999 +} +{ .mfi + nop.m 0 + fsetc.s2 0x7F,0x40 + nop.i 999 +};; + +// +// Is the result zero? +// +{ .mfi + nop.m 999 + fclass.m.unc p6, p0 = FR_Result3, 0x007 + nop.i 999 +} +{ .mfi + addl GR_Tag = 176, r0 + fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big + nop.i 0 +};; + +// +// Detect masked underflow - Tiny + Inexact Only +// +{ .mfi + nop.m 999 +(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 + nop.i 999 +};; + +// +// Is result bigger than the allowed range? +// Branch out for underflow +// +{ .mfb +(p6) addl GR_Tag = 177, r0 +(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig +(p6) br.cond.spnt L(SCALBN_UNDERFLOW) +};; + +// +// Branch out for overflow +// +{ .mbb + nop.m 0 +(p7) br.cond.spnt L(SCALBN_OVERFLOW) +(p9) br.cond.spnt L(SCALBN_OVERFLOW) +};; + +// +// Return from main path. +// +{ .mfb + nop.m 999 + nop.f 0 + br.ret.sptk b0;; +} + +.endp scalbn +ASM_SIZE_DIRECTIVE(scalbn) +.proc __libm_error_region +__libm_error_region: + +L(SCALBN_OVERFLOW): +L(SCALBN_UNDERFLOW): + +// +// Get stack address of N +// +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +// +// Adjust sp +// +{ .mfi +.fframe 64 + add sp=-64,sp + nop.f 0 + mov GR_SAVE_GP=gp +};; + +// +// Store N on stack in correct position +// Locate the address of x on stack +// +{ .mmi + st8 [GR_Parameter_Y] = GR_N_as_int,16 + add GR_Parameter_X = 16,sp +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +};; + +// +// Store x on the stack. +// Get address for result on stack. 
+// +.body +{ .mib + stfd [GR_Parameter_X] = FR_Norm_X + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = FR_Result + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# +};; + +// +// Get location of result on stack +// +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// +// Get the new result +// +{ .mmi + ldfd FR_Result = [GR_Parameter_RESULT] +.restore sp + add sp = 64,sp + mov b0 = GR_SAVE_B0 +};; + +// +// Restore gp, ar.pfs and return +// +{ .mib + mov gp = GR_SAVE_GP + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_scalbnf.S b/sysdeps/ia64/fpu/s_scalbnf.S new file mode 100644 index 0000000000..a68e82d2c1 --- /dev/null +++ b/sysdeps/ia64/fpu/s_scalbnf.S @@ -0,0 +1,366 @@ +//.file "scalbnf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 1/26/01 scalbnf completely reworked and now standalone version +// +// API +//============================================================== +// float = scalbnf (float x, int n) +// input floating point f8 and int n (r33) +// output floating point f8 +// +// Returns x* 2**n using an fma and detects overflow +// and underflow. +// +// + +#include "libm_support.h" + +FR_Big = f6 +FR_NBig = f7 +FR_Floating_X = f8 +FR_Result = f8 +FR_Result2 = f9 +FR_Result3 = f11 +FR_Norm_X = f12 +FR_Two_N = f14 +FR_Two_to_Big = f15 + +GR_N_Biased = r15 +GR_Big = r16 +GR_NBig = r17 +GR_Scratch = r18 +GR_Scratch1 = r19 +GR_Bias = r20 +GR_N_as_int = r21 + +GR_SAVE_B0 = r32 +GR_SAVE_GP = r33 +GR_SAVE_PFS = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Tag = r38 + +.align 32 +.global scalbnf + +.section .text +.proc scalbnf +.align 32 + +scalbnf: + +// +// Is x NAN, INF, ZERO, +-? +// Build the exponent Bias +// +{ .mfi + alloc r32=ar.pfs,1,2,4,0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Bias = 0x0FFFF,r0 +} + +// +// Sign extend input +// Is N zero? 
+// Normalize x +// +{ .mfi + cmp.eq.unc p6,p0 = r33,r0 + fnorm.s1 FR_Norm_X = FR_Floating_X + sxt4 GR_N_as_int = r33 +} +;; + +// +// Normalize x +// Branch and return special values. +// Create -35000 +// Create 35000 +// +{ .mfi + addl GR_Big = 35000,r0 + nop.f 0 + add GR_N_Biased = GR_Bias,GR_N_as_int +} +{ .mfb + addl GR_NBig = -35000,r0 +(p7) fma.s.s0 FR_Result = FR_Floating_X,f1, f0 +(p7) br.ret.spnt b0 +};; + +// +// Build the exponent Bias +// Return x when N = 0 +// +{ .mfi + setf.exp FR_Two_N = GR_N_Biased + nop.f 0 + addl GR_Scratch1 = 0x063BF,r0 +} +{ .mfb + addl GR_Scratch = 0x019C3F,r0 +(p6) fma.s.s0 FR_Result = FR_Floating_X,f1, f0 +(p6) br.ret.spnt b0 +};; + +// +// Create 2**big +// Create 2**-big +// Is N >= 35000 +// Is N <= -35000 +// Raise Denormal operand flag with compare +// Main path, create 2**N +// +{ .mfi + setf.exp FR_NBig = GR_Scratch1 + nop.f 0 + cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big +} +{ .mfi + setf.exp FR_Big = GR_Scratch + fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 + cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig +};; + +// +// Adjust 2**N if N was very small or very large +// +{ .mfi + nop.m 0 +(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch = 0x000000000003007F +};; + + +{ .mfi + nop.m 0 +(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch1= 0x000000000001007F +};; + +// Set up necessary status fields +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x41 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999 +};; + +// +// Do final operation +// FR_NBig and FR_Big are reloaded here with -2**128 and +2**128 +// (sign/exponent words 0x3007F and 0x1007F), the limits tested below. 
+// +{ .mfi + setf.exp FR_NBig = GR_Scratch + fma.s.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; +{ .mfi + setf.exp FR_Big = GR_Scratch1 + fma.s.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 + 
nop.i 999 +};; + +// Check for overflow or underflow. +// Restore s3 +// Restore s2 +// +{ .mfi + nop.m 0 + fsetc.s3 0x7F,0x40 + nop.i 999 +} +{ .mfi + nop.m 0 + fsetc.s2 0x7F,0x40 + nop.i 999 +};; + +// +// Is the result zero? +// +{ .mfi + nop.m 999 + fclass.m.unc p6, p0 = FR_Result3, 0x007 + nop.i 999 +} +{ .mfi + addl GR_Tag = 178, r0 + fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big + nop.i 0 +};; + +// +// Detect masked underflow - Tiny + Inexact Only +// +{ .mfi + nop.m 999 +(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 + nop.i 999 +};; + +// +// Is result bigger than the allowed range? +// Branch out for underflow +// +{ .mfb +(p6) addl GR_Tag = 179, r0 +(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig +(p6) br.cond.spnt L(scalbnf_UNDERFLOW) +};; + +// +// Branch out for overflow +// +{ .mbb + nop.m 0 +(p7) br.cond.spnt L(scalbnf_OVERFLOW) +(p9) br.cond.spnt L(scalbnf_OVERFLOW) +};; + +// +// Return from main path. +// +{ .mfb + nop.m 999 + nop.f 0 + br.ret.sptk b0;; +} + +.endp scalbnf +ASM_SIZE_DIRECTIVE(scalbnf) +.proc __libm_error_region +__libm_error_region: + +L(scalbnf_OVERFLOW): +L(scalbnf_UNDERFLOW): + +// +// Get stack address of N +// +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +// +// Adjust sp +// +{ .mfi +.fframe 64 + add sp=-64,sp + nop.f 0 + mov GR_SAVE_GP=gp +};; + +// +// Store N on stack in correct position +// Locate the address of x on stack +// +{ .mmi + st8 [GR_Parameter_Y] = GR_N_as_int,16 + add GR_Parameter_X = 16,sp +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +};; + +// +// Store x on the stack. +// Get address for result on stack. 
+// +.body +{ .mib + stfs [GR_Parameter_X] = FR_Norm_X + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 +} +{ .mib + stfs [GR_Parameter_Y] = FR_Result + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# +};; + +// +// Get location of result on stack +// +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// +// Get the new result +// +{ .mmi + ldfs FR_Result = [GR_Parameter_RESULT] +.restore sp + add sp = 64,sp + mov b0 = GR_SAVE_B0 +};; + +// +// Restore gp, ar.pfs and return +// +{ .mib + mov gp = GR_SAVE_GP + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_scalbnl.S b/sysdeps/ia64/fpu/s_scalbnl.S new file mode 100644 index 0000000000..5f51c02d47 --- /dev/null +++ b/sysdeps/ia64/fpu/s_scalbnl.S @@ -0,0 +1,366 @@ +//.file "scalbnl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00 Initial version +// 1/26/01 scalbnl completely reworked and now standalone version +// +// API +//============================================================== +// double-extended = scalbnl (double-extended x, int n) +// input floating point f8 and int n (r34) +// output floating point f8 +// +// Returns x* 2**n using an fma and detects overflow +// and underflow. +// +// + +#include "libm_support.h" + +FR_Big = f6 +FR_NBig = f7 +FR_Floating_X = f8 +FR_Result = f8 +FR_Result2 = f9 +FR_Result3 = f11 +FR_Norm_X = f12 +FR_Two_N = f14 +FR_Two_to_Big = f15 + +GR_N_Biased = r15 +GR_Big = r16 +GR_NBig = r17 +GR_Scratch = r18 +GR_Scratch1 = r19 +GR_Bias = r20 +GR_N_as_int = r21 + +GR_SAVE_B0 = r32 +GR_SAVE_GP = r33 +GR_SAVE_PFS = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Tag = r38 + +.align 32 +.global scalbnl + +.section .text +.proc scalbnl +.align 32 + +scalbnl: + +// +// Is x NAN, INF, ZERO, +-? +// Build the exponent Bias +// +{ .mfi + alloc r32=ar.pfs,2,1,4,0 + fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero + addl GR_Bias = 0x0FFFF,r0 +} + +// +// Sign extend input +// Is N zero? 
+// Normalize x +// +{ .mfi + cmp.eq.unc p6,p0 = r34,r0 + fnorm.s1 FR_Norm_X = FR_Floating_X + sxt4 GR_N_as_int = r34 +} +;; + +// +// Normalize x +// Branch and return special values. +// Create -35000 +// Create 35000 +// +{ .mfi + addl GR_Big = 35000,r0 + nop.f 0 + add GR_N_Biased = GR_Bias,GR_N_as_int +} +{ .mfb + addl GR_NBig = -35000,r0 +(p7) fma.s0 FR_Result = FR_Floating_X,f1, f0 +(p7) br.ret.spnt b0 +};; + +// +// Build the exponent Bias +// Return x when N = 0 +// +{ .mfi + setf.exp FR_Two_N = GR_N_Biased + nop.f 0 + addl GR_Scratch1 = 0x063BF,r0 +} +{ .mfb + addl GR_Scratch = 0x019C3F,r0 +(p6) fma.s0 FR_Result = FR_Floating_X,f1, f0 +(p6) br.ret.spnt b0 +};; + +// +// Create 2**big +// Create 2**-big +// Is N >= 35000 +// Is N <= -35000 +// Raise Denormal operand flag with compare +// Main path, create 2**N +// +{ .mfi + setf.exp FR_NBig = GR_Scratch1 + nop.f 0 + cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big +} +{ .mfi + setf.exp FR_Big = GR_Scratch + fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 + cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig +};; + +// +// Adjust 2**N if N was very small or very large +// +{ .mfi + nop.m 0 +(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch = 0x0000000000033FFF +};; + + +{ .mfi + nop.m 0 +(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 + nop.i 0 +} +{ .mlx + nop.m 999 +(p0) movl GR_Scratch1= 0x0000000000013FFF +};; + +// Set up necessary status fields +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + FZ + TD (Underflows) +// +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x41 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x42 + nop.i 999 +};; + +// +// Do final operation +// FR_NBig and FR_Big are reloaded here with -2**16384 and +2**16384 +// (sign/exponent words 0x33FFF and 0x13FFF), the limits tested below. 
+// +{ .mfi + setf.exp FR_NBig = GR_Scratch + fma.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; +{ .mfi + setf.exp FR_Big = GR_Scratch1 + fma.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 + nop.i 999 +};; 
+ +// Check for overflow or underflow. +// Restore s3 +// Restore s2 +// +{ .mfi + nop.m 0 + fsetc.s3 0x7F,0x40 + nop.i 999 +} +{ .mfi + nop.m 0 + fsetc.s2 0x7F,0x40 + nop.i 999 +};; + +// +// Is the result zero? +// +{ .mfi + nop.m 999 + fclass.m.unc p6, p0 = FR_Result3, 0x007 + nop.i 999 +} +{ .mfi + addl GR_Tag = 174, r0 + fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big + nop.i 0 +};; + +// +// Detect masked underflow - Tiny + Inexact Only +// +{ .mfi + nop.m 999 +(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 + nop.i 999 +};; + +// +// Is result bigger than the allowed range? +// Branch out for underflow +// +{ .mfb +(p6) addl GR_Tag = 175, r0 +(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig +(p6) br.cond.spnt L(scalbnl_UNDERFLOW) +};; + +// +// Branch out for overflow +// +{ .mbb + nop.m 0 +(p7) br.cond.spnt L(scalbnl_OVERFLOW) +(p9) br.cond.spnt L(scalbnl_OVERFLOW) +};; + +// +// Return from main path. +// +{ .mfb + nop.m 999 + nop.f 0 + br.ret.sptk b0;; +} + +.endp scalbnl +ASM_SIZE_DIRECTIVE(scalbnl) +.proc __libm_error_region +__libm_error_region: + +L(scalbnl_OVERFLOW): +L(scalbnl_UNDERFLOW): + +// +// Get stack address of N +// +.prologue +{ .mfi + add GR_Parameter_Y=-32,sp + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +// +// Adjust sp +// +{ .mfi +.fframe 64 + add sp=-64,sp + nop.f 0 + mov GR_SAVE_GP=gp +};; + +// +// Store N on stack in correct position +// Locate the address of x on stack +// +{ .mmi + st8 [GR_Parameter_Y] = GR_N_as_int,16 + add GR_Parameter_X = 16,sp +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +};; + +// +// Store x on the stack. +// Get address for result on stack. 
+// +.body +{ .mib + stfe [GR_Parameter_X] = FR_Norm_X + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 +} +{ .mib + stfe [GR_Parameter_Y] = FR_Result + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# +};; + +// +// Get location of result on stack +// +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// +// Get the new result +// +{ .mmi + ldfe FR_Result = [GR_Parameter_RESULT] +.restore sp + add sp = 64,sp + mov b0 = GR_SAVE_B0 +};; + +// +// Restore gp, ar.pfs and return +// +{ .mib + mov gp = GR_SAVE_GP + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +};; + +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_significand.S b/sysdeps/ia64/fpu/s_significand.S new file mode 100644 index 0000000000..0cbfd4256c --- /dev/null +++ b/sysdeps/ia64/fpu/s_significand.S @@ -0,0 +1,147 @@ +.file "significand.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 4/04/00 Unwind support added +// 5/31/00: Fixed bug when x a double-extended denormal +// +// API +//============================================================== +// double significand(double x) +// +// Overview of operation +//============================================================== +// If x = sig * 2**n with 1 <= sig < 2 +// significand returns sig +// +// predicate registers used: +// p6, p7 +// +// floating-point registers used: +// f8, f9, f10 + +#include "libm_support.h" + +.align 32 +.global significand# + +.section .text +.proc significand# +.align 32 + +significand: + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 + +// f10 gets f8(sign) with f1(exp,significand) +{ .mfi + nop.m 999 +(p0) fmerge.s f10 = f8,f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fnorm f9 = f8 + nop.i 999 ;; +} + +// Test for denormal input +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f8, 0x0b + nop.i 999 ;; +} + +// p6 = TRUE ==> x is not (nan,inf,0) +// return sign(f8) exp(f1) significand(f8) +// else x is (nan,inf,0) +// return sign(f8) exp(f8) significand(f8), normalized. 
+{ .mfi + nop.m 999 +(p0) fclass.m.unc p0,p6 = f8, 0xe7 + nop.i 999 ;; +} + +{ .mmb + nop.m 999 + nop.m 999 +(p7) br.cond.spnt L(SIGNIFICAND_DENORM) ;; // Branch if x denormal +} + +{ .mfi + nop.m 999 +(p6) fmerge.se f8 = f10,f8 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fnorm.d f8 = f8 +(p0) br.ret.sptk b0 ;; +} + +L(SIGNIFICAND_DENORM): +// Here if x denorm +{ .mfi + nop.m 999 +(p0) fmerge.se f8 = f10,f9 + nop.i 999 ;; +} + +// Check if fnorm(x) still denormal, means x double-extended denormal +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0x0b + nop.i 999 ;; +} + +// This will be the final result unless x double-extended denormal +{ .mfi + nop.m 999 +(p0) fnorm.d f8 = f8 + nop.i 999 ;; +} + +// If x double-extended denorm, then significand ok, but must merge in +// correct signexp +{ .mfi + nop.m 999 +(p7) fmerge.se f8 = f10,f8 + nop.i 999 ;; +} + +// Final normalization if x double-extended denorm +{ .mfb + nop.m 999 +(p7) fnorm.d f8 = f8 +(p0) br.ret.sptk b0 ;; +} + +.endp significand +ASM_SIZE_DIRECTIVE(significand) diff --git a/sysdeps/ia64/fpu/s_significandf.S b/sysdeps/ia64/fpu/s_significandf.S new file mode 100644 index 0000000000..bdabe34dac --- /dev/null +++ b/sysdeps/ia64/fpu/s_significandf.S @@ -0,0 +1,146 @@ +.file "significandf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 2/03/00: Modified to improve speed +// 5/31/00: Fixed bug when x a double-extended denormal +// +// API +//============================================================== +// float significandf(float x) +// Overview of operation +//============================================================== +// If x = sig * 2**n with 1 <= sig < 2 +// significandf returns sig +// +// predicate registers used: +// p6, p7 +// +// floating-point registers used: +// f8, f9, f10 + +#include "libm_support.h" + +.align 32 +.global significandf# + +.section .text +.proc significandf# +.align 32 + +significandf: + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 + +// f10 gets f8(sign) with f1(exp,significand) +{ .mfi + nop.m 999 +(p0) fmerge.s f10 = f8,f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fnorm f9 = f8 + nop.i 999 ;; +} + +// Test for denormal input +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f8, 0x0b + nop.i 999 ;; +} + +// p6 = TRUE ==> x is not (nan,inf,0) +// return sign(f8) exp(f1) significand(f8) +// else x is (nan,inf,0) +// return sign(f8) exp(f8) significand(f8), normalized. 
+{ .mfi + nop.m 999 +(p0) fclass.m.unc p0,p6 = f8, 0xe7 + nop.i 999 ;; +} + +{ .mmb + nop.m 999 + nop.m 999 +(p7) br.cond.spnt L(SIGNIFICAND_DENORM) ;; // Branch if x denormal +} + +{ .mfi + nop.m 999 +(p6) fmerge.se f8 = f10,f8 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fnorm.s f8 = f8 +(p0) br.ret.sptk b0 ;; +} + +L(SIGNIFICAND_DENORM): +// Here if x denorm +{ .mfi + nop.m 999 +(p0) fmerge.se f8 = f10,f9 + nop.i 999 ;; +} + +// Check if fnorm(x) still denormal, means x double-extended denormal +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0x0b + nop.i 999 ;; +} + +// This will be the final result unless x double-extended denormal +{ .mfi + nop.m 999 +(p0) fnorm.s f8 = f8 + nop.i 999 ;; +} + +// If x double-extended denorm, then significand ok, but must merge in +// correct signexp +{ .mfi + nop.m 999 +(p7) fmerge.se f8 = f10,f8 + nop.i 999 ;; +} + +// Final normalization if x double-extended denorm +{ .mfb + nop.m 999 +(p7) fnorm.s f8 = f8 +(p0) br.ret.sptk b0 ;; +} + +.endp significandf +ASM_SIZE_DIRECTIVE(significandf) diff --git a/sysdeps/ia64/fpu/s_significandl.S b/sysdeps/ia64/fpu/s_significandl.S new file mode 100644 index 0000000000..5dcda0e6b8 --- /dev/null +++ b/sysdeps/ia64/fpu/s_significandl.S @@ -0,0 +1,147 @@ +.file "significandl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 2/03/00: Modified to improve speed +// 5/31/00: Fixed bug when x a double-extended denormal +// +// API +//============================================================== +// long double significandl(long double x) +// +// Overview of operation +//============================================================== +// If x = sig * 2**n with 1 <= sig < 2 +// significandl returns sig +// +// predicate registers used: +// p6, p7 +// +// floating-point registers used: +// f8, f9, f10 + +#include "libm_support.h" + +.align 32 +.global significandl# + +.section .text +.proc significandl# +.align 32 + +significandl: + +// qnan snan inf norm unorm 0 -+ +// 1 1 1 0 0 1 11 + +// f10 gets f8(sign) with f1(exp,significand) +{ .mfi + nop.m 999 +(p0) fmerge.s f10 = f8,f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fnorm f9 = f8 + nop.i 999 ;; +} + +// Test for denormal input +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f8, 0x0b + nop.i 999 ;; +} + +// p6 = TRUE ==> x is not (nan,inf,0) +// return sign(f8) exp(f1) significand(f8) +// else x is (nan,inf,0) +// return sign(f8) exp(f8) significand(f8), normalized. 
+{ .mfi + nop.m 999 +(p0) fclass.m.unc p0,p6 = f8, 0xe7 + nop.i 999 ;; +} + +{ .mmb + nop.m 999 + nop.m 999 +(p7) br.cond.spnt L(SIGNIFICAND_DENORM) ;; // Branch if x denormal +} + +{ .mfi + nop.m 999 +(p6) fmerge.se f8 = f10,f8 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fnorm f8 = f8 +(p0) br.ret.sptk b0 ;; +} + +L(SIGNIFICAND_DENORM): +// Here if x denorm +{ .mfi + nop.m 999 +(p0) fmerge.se f8 = f10,f9 + nop.i 999 ;; +} + +// Check if fnorm(x) still denormal, means x double-extended denormal +{ .mfi + nop.m 999 +(p0) fclass.m.unc p7,p0 = f9, 0x0b + nop.i 999 ;; +} + +// This will be the final result unless x double-extended denormal +{ .mfi + nop.m 999 +(p0) fnorm f8 = f8 + nop.i 999 ;; +} + +// If x double-extended denorm, then significand ok, but must merge in +// correct signexp +{ .mfi + nop.m 999 +(p7) fmerge.se f8 = f10,f8 + nop.i 999 ;; +} + +// Final normalization if x double-extended denorm +{ .mfb + nop.m 999 +(p7) fnorm f8 = f8 +(p0) br.ret.sptk b0 ;; +} + +.endp significandl +ASM_SIZE_DIRECTIVE(significandl) diff --git a/sysdeps/ia64/fpu/s_sin.c b/sysdeps/ia64/fpu/s_sin.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/s_sin.c @@ -0,0 +1 @@ +/* Not needed. 
*/ diff --git a/sysdeps/ia64/fpu/s_sincos.c b/sysdeps/ia64/fpu/s_sincos.c new file mode 100644 index 0000000000..1ddbc2122a --- /dev/null +++ b/sysdeps/ia64/fpu/s_sincos.c @@ -0,0 +1,9 @@ +#include <math.h> + +void +__sincos (double x, double *s, double *c) +{ + *s = sin (x); + *c = cos (x); +} +weak_alias (__sincos, sincos) diff --git a/sysdeps/ia64/fpu/s_sincosf.c b/sysdeps/ia64/fpu/s_sincosf.c new file mode 100644 index 0000000000..efd0fe3038 --- /dev/null +++ b/sysdeps/ia64/fpu/s_sincosf.c @@ -0,0 +1,9 @@ +#include <math.h> + +void +__sincosf (float x, float *s, float *c) +{ + *s = sinf (x); + *c = cosf (x); +} +weak_alias (__sincosf, sincosf) diff --git a/sysdeps/ia64/fpu/s_sincosl.c b/sysdeps/ia64/fpu/s_sincosl.c new file mode 100644 index 0000000000..a835b772e2 --- /dev/null +++ b/sysdeps/ia64/fpu/s_sincosl.c @@ -0,0 +1,9 @@ +#include <math.h> + +void +__sincosl (long double x, long double *s, long double *c) +{ + *s = sinl (x); + *c = cosl (x); +} +weak_alias (__sincosl, sincosl) diff --git a/sysdeps/ia64/fpu/s_sinf.c b/sysdeps/ia64/fpu/s_sinf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/s_sinf.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/s_sinl.c b/sysdeps/ia64/fpu/s_sinl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/s_sinl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/s_tan.S b/sysdeps/ia64/fpu/s_tan.S new file mode 100644 index 0000000000..3678a42476 --- /dev/null +++ b/sysdeps/ia64/fpu/s_tan.S @@ -0,0 +1,757 @@ +.file "tan.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 4/04/00: Unwind support added +// 12/27/00: Improved speed +// +// API +//============================================================== +// double tan( double x); +// +// Overview of operation +//============================================================== +// If the input value in radians is |x| >= 1.xxxxx 2^10 call the +// older slower version. +// +// The new algorithm is used when |x| <= 1.xxxxx 2^9. +// +// Represent the input X as Nfloat * pi/2 + r +// where r can be negative and |r| <= pi/4 +// +// tan_W = x * 2/pi +// Nfloat = round_int(tan_W) +// +// tan_r = x - Nfloat * (pi/2)_hi +// tan_r = tan_r - Nfloat * (pi/2)_lo +// +// We have two paths: p8, when Nfloat is even, and p9, when Nfloat is odd. +// p8: tan(X) = tan(r) +// p9: tan(X) = -cot(r) +// +// Each is evaluated as a series. The p9 path requires 1/r. 
+// +// The coefficients used in the series are stored in a table as +// are the pi constants. +// +// Registers used +//============================================================== +// +// predicate registers used: +// p6-10 +// +// floating-point registers used: +// f10-15, f32-105 +// f8, input +// +// general registers used +// r14-18, r32-43 +// + +#include "libm_support.h" + +// Assembly macros +//============================================================== +TAN_INV_PI_BY_2_2TO64 = f10 +TAN_RSHF_2TO64 = f11 +TAN_2TOM64 = f12 +TAN_RSHF = f13 +TAN_W_2TO64_RSH = f14 +TAN_NFLOAT = f15 + +tan_Inv_Pi_by_2 = f32 +tan_Pi_by_2_hi = f33 +tan_Pi_by_2_lo = f34 + + +tan_P0 = f35 +tan_P1 = f36 +tan_P2 = f37 +tan_P3 = f38 +tan_P4 = f39 +tan_P5 = f40 +tan_P6 = f41 +tan_P7 = f42 +tan_P8 = f43 +tan_P9 = f44 +tan_P10 = f45 +tan_P11 = f46 +tan_P12 = f47 +tan_P13 = f48 +tan_P14 = f49 +tan_P15 = f50 + +tan_Q0 = f51 +tan_Q1 = f52 +tan_Q2 = f53 +tan_Q3 = f54 +tan_Q4 = f55 +tan_Q5 = f56 +tan_Q6 = f57 +tan_Q7 = f58 +tan_Q8 = f59 +tan_Q9 = f60 +tan_Q10 = f61 + +tan_r = f62 +tan_rsq = f63 +tan_rcube = f64 + +tan_v18 = f65 +tan_v16 = f66 +tan_v17 = f67 +tan_v12 = f68 +tan_v13 = f69 +tan_v7 = f70 +tan_v8 = f71 +tan_v4 = f72 +tan_v5 = f73 +tan_v15 = f74 +tan_v11 = f75 +tan_v14 = f76 +tan_v3 = f77 +tan_v6 = f78 +tan_v10 = f79 +tan_v2 = f80 +tan_v9 = f81 +tan_v1 = f82 +tan_int_Nfloat = f83 +tan_Nfloat = f84 + +tan_NORM_f8 = f85 +tan_W = f86 + +tan_y0 = f87 +tan_d = f88 +tan_y1 = f89 +tan_dsq = f90 +tan_y2 = f91 +tan_d4 = f92 +tan_inv_r = f93 + +tan_z1 = f94 +tan_z2 = f95 +tan_z3 = f96 +tan_z4 = f97 +tan_z5 = f98 +tan_z6 = f99 +tan_z7 = f100 +tan_z8 = f101 +tan_z9 = f102 +tan_z10 = f103 +tan_z11 = f104 +tan_z12 = f105 + + +///////////////////////////////////////////////////////////// + +tan_GR_sig_inv_pi_by_2 = r14 +tan_GR_rshf_2to64 = r15 +tan_GR_exp_2tom64 = r16 +tan_GR_n = r17 +tan_GR_rshf = r18 + +tan_AD = r33 +tan_GR_10009 = r34 +tan_GR_17_ones = r35 +tan_GR_N_odd_even = r36 
+tan_GR_N = r37 +tan_signexp = r38 +tan_exp = r39 +tan_ADQ = r40 + +GR_SAVE_PFS = r41 +GR_SAVE_B0 = r42 +GR_SAVE_GP = r43 + + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +double_tan_constants: +ASM_TYPE_DIRECTIVE(double_tan_constants,@object) +// data8 0xA2F9836E4E44152A, 0x00003FFE // 2/pi + data8 0xC90FDAA22168C234, 0x00003FFF // pi/2 hi + + data8 0xBEEA54580DDEA0E1 // P14 + data8 0x3ED3021ACE749A59 // P15 + data8 0xBEF312BD91DC8DA1 // P12 + data8 0x3EFAE9AFC14C5119 // P13 + data8 0x3F2F342BF411E769 // P8 + data8 0x3F1A60FC9F3B0227 // P9 + data8 0x3EFF246E78E5E45B // P10 + data8 0x3F01D9D2E782875C // P11 + data8 0x3F8226E34C4499B6 // P4 + data8 0x3F6D6D3F12C236AC // P5 + data8 0x3F57DA1146DCFD8B // P6 + data8 0x3F43576410FE3D75 // P7 + data8 0x3FD5555555555555 // P0 + data8 0x3FC11111111111C2 // P1 + data8 0x3FABA1BA1BA0E850 // P2 + data8 0x3F9664F4886725A7 // P3 +ASM_SIZE_DIRECTIVE(double_tan_constants) + +double_Q_tan_constants: +ASM_TYPE_DIRECTIVE(double_Q_tan_constants,@object) + data8 0xC4C6628B80DC1CD1, 0x00003FBF // pi/2 lo + data8 0x3E223A73BA576E48 // Q8 + data8 0x3DF54AD8D1F2CA43 // Q9 + data8 0x3EF66A8EE529A6AA // Q4 + data8 0x3EC2281050410EE6 // Q5 + data8 0x3E8D6BB992CC3CF5 // Q6 + data8 0x3E57F88DE34832E4 // Q7 + data8 0x3FD5555555555555 // Q0 + data8 0x3F96C16C16C16DB8 // Q1 + data8 0x3F61566ABBFFB489 // Q2 + data8 0x3F2BBD77945C1733 // Q3 + data8 0x3D927FB33E2B0E04 // Q10 +ASM_SIZE_DIRECTIVE(double_Q_tan_constants) + + + +.align 32 +.global tan# +#ifdef _LIBC +.global __tan# +#endif + +//////////////////////////////////////////////////////// + + + +.section .text +.proc tan# +#ifdef _LIBC +.proc __tan# +#endif +.align 32 +tan: +#ifdef _LIBC +__tan: +#endif +// The initial fnorm will take any unmasked faults and +// normalize any single/double unorms + +{ .mlx + alloc r32=ar.pfs,1,11,0,0 + movl tan_GR_sig_inv_pi_by_2 = 0xA2F9836E4E44152A // significand of 2/pi +} +{ .mlx + addl tan_AD = @ltoff(double_tan_constants), gp + movl 
tan_GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+63+1) +} +;; + +{ .mfi + ld8 tan_AD = [tan_AD] + fnorm tan_NORM_f8 = f8 + mov tan_GR_exp_2tom64 = 0xffff-64 // exponent of scaling factor 2^-64 +} +{ .mlx + nop.m 999 + movl tan_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift +} +;; + + +// Form two constants we need +// 2/pi * 2^1 * 2^63, scaled by 2^64 since we just loaded the significand +// 1.1000...000 * 2^(63+63+1) to right shift int(W) into the significand +{ .mmi + setf.sig TAN_INV_PI_BY_2_2TO64 = tan_GR_sig_inv_pi_by_2 + setf.d TAN_RSHF_2TO64 = tan_GR_rshf_2to64 + mov tan_GR_17_ones = 0x1ffff ;; +} + + +// Form another constant +// 2^-64 for scaling Nfloat +// Also form the address of the Q table (pi/2 lo and Q coefficients) +{ .mmf + setf.exp TAN_2TOM64 = tan_GR_exp_2tom64 + adds tan_ADQ = double_Q_tan_constants - double_tan_constants, tan_AD + fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 +} +;; + + +// Form another constant +// 1.1000...000 * 2^63, the right shift constant +// Also start loading the table: pi/2 hi comes first +{ .mmf + setf.d TAN_RSHF = tan_GR_rshf + ldfe tan_Pi_by_2_hi = [tan_AD],16 + fclass.m.unc p7,p0 = f8, 0x23 // Test for x=inf +} +;; + +{ .mfb + ldfe tan_Pi_by_2_lo = [tan_ADQ],16 + fclass.m.unc p8,p0 = f8, 0xc3 // Test for x=nan +(p6) br.ret.spnt b0 ;; // Exit for x=0 +} + +{ .mfi + ldfpd tan_P14,tan_P15 = [tan_AD],16 +(p7) frcpa.s0 f8,p9=f0,f0 // Set qnan indef if x=inf + mov tan_GR_10009 = 0x10009 +} +{ .mib + ldfpd tan_Q8,tan_Q9 = [tan_ADQ],16 + nop.i 999 +(p7) br.ret.spnt b0 ;; // Exit for x=inf +} + +{ .mfi + ldfpd tan_P12,tan_P13 = [tan_AD],16 +(p8) fma.d f8=f8,f1,f8 // Set qnan if x=nan + nop.i 999 +} +{ .mib + ldfpd tan_Q4,tan_Q5 = [tan_ADQ],16 + nop.i 999 +(p8) br.ret.spnt b0 ;; // Exit for x=nan +} + +{ .mmi + getf.exp tan_signexp = tan_NORM_f8 + ldfpd tan_P8,tan_P9 = [tan_AD],16 + nop.i 999 ;; +} + +// Multiply x by scaled 2/pi and add large const to shift integer part of W to +// rightmost bits of significand +{ .mfi + ldfpd tan_Q6,tan_Q7 = 
[tan_ADQ],16 + fma.s1 TAN_W_2TO64_RSH = tan_NORM_f8,TAN_INV_PI_BY_2_2TO64,TAN_RSHF_2TO64 + nop.i 999 ;; +} + +{ .mmi + ldfpd tan_P10,tan_P11 = [tan_AD],16 + nop.m 999 + and tan_exp = tan_GR_17_ones, tan_signexp ;; +} + + +// p7 is true if we must call DBX TAN +// p7 is true if f8 exp is >= 0x10009 (which includes all ones +// NAN or inf) +{ .mmi + ldfpd tan_Q0,tan_Q1 = [tan_ADQ],16 + cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009 + nop.i 999 ;; +} + + +{ .mmb + ldfpd tan_P4,tan_P5 = [tan_AD],16 + nop.m 999 +(p7) br.cond.spnt L(TAN_DBX) ;; +} + + +{ .mmi + ldfpd tan_Q2,tan_Q3 = [tan_ADQ],16 + nop.m 999 + nop.i 999 ;; +} + + + +// TAN_NFLOAT = Round_Int_Nearest(tan_W) +{ .mfi + ldfpd tan_P6,tan_P7 = [tan_AD],16 + fms.s1 TAN_NFLOAT = TAN_W_2TO64_RSH,TAN_2TOM64,TAN_RSHF + nop.i 999 ;; +} + + +{ .mfi + ldfd tan_Q10 = [tan_ADQ] + nop.f 999 + nop.i 999 ;; +} + + +{ .mfi + ldfpd tan_P0,tan_P1 = [tan_AD],16 + nop.f 999 + nop.i 999 ;; +} + + +{ .mfi + getf.sig tan_GR_n = TAN_W_2TO64_RSH + nop.f 999 + nop.i 999 ;; +} + +// tan_r = -TAN_NFLOAT * tan_Pi_by_2_hi + x +{ .mfi + ldfpd tan_P2,tan_P3 = [tan_AD] + fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_hi, tan_NORM_f8 + nop.i 999 ;; +} + + +// p8 ==> even +// p9 ==> odd +{ .mmi + and tan_GR_N_odd_even = 0x1, tan_GR_n ;; + nop.m 999 + cmp.eq.unc p8,p9 = tan_GR_N_odd_even, r0 ;; +} + + +// tan_r = tan_r - TAN_NFLOAT * tan_Pi_by_2_lo +{ .mfi + nop.m 999 + fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 + fma.s1 tan_rsq = tan_r, tan_r, f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p9) frcpa.s1 tan_y0, p10 = f1,tan_r + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v18 = tan_rsq, tan_P15, tan_P14 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v16 = tan_rsq, tan_P13, tan_P12 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) 
fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p9) fnma.s1 tan_d = tan_r, tan_y0, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z11 = tan_rsq, tan_Q9, tan_Q8 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z12 = tan_rsq, tan_rsq, f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z7 = tan_rsq, tan_Q5, tan_Q4 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v11 = tan_v17, tan_v13, tan_v12 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z8 = tan_rsq, tan_Q7, tan_Q6 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z3 = tan_rsq, tan_Q1, tan_Q0 + nop.i 999 ;; +} + + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_dsq = tan_d, tan_d, f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z10 = tan_z12, tan_Q10, tan_z11 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z9 = tan_z12, tan_z12,f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z4 = tan_rsq, tan_Q3, tan_Q2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z6 = tan_z12, tan_z8, tan_z7 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0 + 
nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v2 = tan_v14, tan_v6, tan_v3 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z2 = tan_z12, tan_z4, tan_z3 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z5 = tan_z9, tan_z10, tan_z6 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_rcube = tan_rsq, tan_r, f0 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z1 = tan_z9, tan_z5, tan_z2 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.d.s0 f8 = tan_v1, tan_rcube, tan_r + nop.i 999 +} +{ .mfb + nop.m 999 +(p9) fms.d.s0 f8 = tan_r, tan_z1, tan_inv_r + br.ret.sptk b0 ;; +} +.endp tan# +ASM_SIZE_DIRECTIVE(tan) + + +.proc __libm_callout +__libm_callout: +L(TAN_DBX): +.prologue + +{ .mfi + nop.m 0 + fmerge.s f9 = f0,f0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +;; + +{ .mfi + mov GR_SAVE_GP=gp + nop.f 0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +} + +.body +{ .mfb + nop.m 999 + nop.f 999 + br.call.sptk.many b0=__libm_tan# ;; +} + + +{ .mfi + mov gp = GR_SAVE_GP + fnorm.d f8 = f8 + mov b0 = GR_SAVE_B0 +} +;; + + +{ .mib + nop.m 999 + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +;; +} + + +.endp __libm_callout +ASM_SIZE_DIRECTIVE(__libm_callout) + +.type __libm_tan#,@function +.global __libm_tan# diff --git a/sysdeps/ia64/fpu/s_tanf.S b/sysdeps/ia64/fpu/s_tanf.S new file mode 100644 index 0000000000..b4493c1554 --- /dev/null +++ b/sysdeps/ia64/fpu/s_tanf.S @@ -0,0 +1,757 @@ +.file "tanf.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. 
+// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// History +//============================================================== +// 2/02/00: Initial version +// 4/04/00 Unwind support added +// 12/27/00 Improved speed +// +// API +//============================================================== +// float tanf( float x); +// +// Overview of operation +//============================================================== +// If the input value in radians is |x| >= 1.xxxxx 2^10 call the +// older slower version. +// +// The new algorithm is used when |x| <= 1.xxxxx 2^9. +// +// Represent the input X as Nfloat * pi/2 + r +// where r can be negative and |r| <= pi/4 +// +// tan_W = x * 2/pi +// Nfloat = round_int(tan_W) +// +// tan_r = x - Nfloat * (pi/2)_hi +// tan_r = tan_r - Nfloat * (pi/2)_lo +// +// We have two paths: p8, when Nfloat is even and p9. 
when Nfloat is odd. +// p8: tan(X) = tan(r) +// p9: tan(X) = -cot(r) +// +// Each is evaluated as a series. The p9 path requires 1/r. +// +// The coefficients used in the series are stored in a table as +// are the pi constants. +// +// Registers used +//============================================================== +// +// predicate registers used: +// p6-10 +// +// floating-point registers used: +// f10-15, f32-105 +// f8, input +// +// general registers used +// r14-18, r32-43 +// + +#include "libm_support.h" + +// Assembly macros +//============================================================== +TAN_INV_PI_BY_2_2TO64 = f10 +TAN_RSHF_2TO64 = f11 +TAN_2TOM64 = f12 +TAN_RSHF = f13 +TAN_W_2TO64_RSH = f14 +TAN_NFLOAT = f15 + +tan_Inv_Pi_by_2 = f32 +tan_Pi_by_2_hi = f33 +tan_Pi_by_2_lo = f34 + + +tan_P0 = f35 +tan_P1 = f36 +tan_P2 = f37 +tan_P3 = f38 +tan_P4 = f39 +tan_P5 = f40 +tan_P6 = f41 +tan_P7 = f42 +tan_P8 = f43 +tan_P9 = f44 +tan_P10 = f45 +tan_P11 = f46 +tan_P12 = f47 +tan_P13 = f48 +tan_P14 = f49 +tan_P15 = f50 + +tan_Q0 = f51 +tan_Q1 = f52 +tan_Q2 = f53 +tan_Q3 = f54 +tan_Q4 = f55 +tan_Q5 = f56 +tan_Q6 = f57 +tan_Q7 = f58 +tan_Q8 = f59 +tan_Q9 = f60 +tan_Q10 = f61 + +tan_r = f62 +tan_rsq = f63 +tan_rcube = f64 + +tan_v18 = f65 +tan_v16 = f66 +tan_v17 = f67 +tan_v12 = f68 +tan_v13 = f69 +tan_v7 = f70 +tan_v8 = f71 +tan_v4 = f72 +tan_v5 = f73 +tan_v15 = f74 +tan_v11 = f75 +tan_v14 = f76 +tan_v3 = f77 +tan_v6 = f78 +tan_v10 = f79 +tan_v2 = f80 +tan_v9 = f81 +tan_v1 = f82 +tan_int_Nfloat = f83 +tan_Nfloat = f84 + +tan_NORM_f8 = f85 +tan_W = f86 + +tan_y0 = f87 +tan_d = f88 +tan_y1 = f89 +tan_dsq = f90 +tan_y2 = f91 +tan_d4 = f92 +tan_inv_r = f93 + +tan_z1 = f94 +tan_z2 = f95 +tan_z3 = f96 +tan_z4 = f97 +tan_z5 = f98 +tan_z6 = f99 +tan_z7 = f100 +tan_z8 = f101 +tan_z9 = f102 +tan_z10 = f103 +tan_z11 = f104 +tan_z12 = f105 + + +///////////////////////////////////////////////////////////// + +tan_GR_sig_inv_pi_by_2 = r14 +tan_GR_rshf_2to64 = r15 
+tan_GR_exp_2tom64 = r16 +tan_GR_n = r17 +tan_GR_rshf = r18 + +tan_AD = r33 +tan_GR_10009 = r34 +tan_GR_17_ones = r35 +tan_GR_N_odd_even = r36 +tan_GR_N = r37 +tan_signexp = r38 +tan_exp = r39 +tan_ADQ = r40 + +GR_SAVE_PFS = r41 +GR_SAVE_B0 = r42 +GR_SAVE_GP = r43 + + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 16 + +double_tan_constants: +ASM_TYPE_DIRECTIVE(double_tan_constants,@object) +// data8 0xA2F9836E4E44152A, 0x00003FFE // 2/pi + data8 0xC90FDAA22168C234, 0x00003FFF // pi/2 hi + + data8 0xBEEA54580DDEA0E1 // P14 + data8 0x3ED3021ACE749A59 // P15 + data8 0xBEF312BD91DC8DA1 // P12 + data8 0x3EFAE9AFC14C5119 // P13 + data8 0x3F2F342BF411E769 // P8 + data8 0x3F1A60FC9F3B0227 // P9 + data8 0x3EFF246E78E5E45B // P10 + data8 0x3F01D9D2E782875C // P11 + data8 0x3F8226E34C4499B6 // P4 + data8 0x3F6D6D3F12C236AC // P5 + data8 0x3F57DA1146DCFD8B // P6 + data8 0x3F43576410FE3D75 // P7 + data8 0x3FD5555555555555 // P0 + data8 0x3FC11111111111C2 // P1 + data8 0x3FABA1BA1BA0E850 // P2 + data8 0x3F9664F4886725A7 // P3 +ASM_SIZE_DIRECTIVE(double_tan_constants) + +double_Q_tan_constants: +ASM_TYPE_DIRECTIVE(double_Q_tan_constants,@object) + data8 0xC4C6628B80DC1CD1, 0x00003FBF // pi/2 lo + data8 0x3E223A73BA576E48 // Q8 + data8 0x3DF54AD8D1F2CA43 // Q9 + data8 0x3EF66A8EE529A6AA // Q4 + data8 0x3EC2281050410EE6 // Q5 + data8 0x3E8D6BB992CC3CF5 // Q6 + data8 0x3E57F88DE34832E4 // Q7 + data8 0x3FD5555555555555 // Q0 + data8 0x3F96C16C16C16DB8 // Q1 + data8 0x3F61566ABBFFB489 // Q2 + data8 0x3F2BBD77945C1733 // Q3 + data8 0x3D927FB33E2B0E04 // Q10 +ASM_SIZE_DIRECTIVE(double_Q_tan_constants) + + + +.align 32 +.global tanf# +#ifdef _LIBC +.global __tanf# +#endif + +//////////////////////////////////////////////////////// + + + +.section .text +.proc tanf# +#ifdef _LIBC +.proc __tanf# +#endif +.align 32 +tanf: +#ifdef _LIBC +__tanf: +#endif +// The initial fnorm will take any unmasked faults and +// normalize any single/double unorms + +{ .mlx + alloc 
r32=ar.pfs,1,11,0,0 + movl tan_GR_sig_inv_pi_by_2 = 0xA2F9836E4E44152A // significand of 2/pi +} +{ .mlx + addl tan_AD = @ltoff(double_tan_constants), gp + movl tan_GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+63+1) +} +;; + +{ .mfi + ld8 tan_AD = [tan_AD] + fnorm tan_NORM_f8 = f8 + mov tan_GR_exp_2tom64 = 0xffff-64 // exponent of scaling factor 2^-64 +} +{ .mlx + nop.m 999 + movl tan_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift +} +;; + + +// Form two constants we need +// 2/pi * 2^1 * 2^63, scaled by 2^64 since we just loaded the significand +// 1.1000...000 * 2^(63+63+1) to right shift int(W) into the significand +{ .mmi + setf.sig TAN_INV_PI_BY_2_2TO64 = tan_GR_sig_inv_pi_by_2 + setf.d TAN_RSHF_2TO64 = tan_GR_rshf_2to64 + mov tan_GR_17_ones = 0x1ffff ;; +} + + +// Form another constant +// 2^-64 for scaling Nfloat +// 1.1000...000 * 2^63, the right shift constant +{ .mmf + setf.exp TAN_2TOM64 = tan_GR_exp_2tom64 + adds tan_ADQ = double_Q_tan_constants - double_tan_constants, tan_AD + fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 +} +;; + + +// Form another constant +// 2^-64 for scaling Nfloat +// 1.1000...000 * 2^63, the right shift constant +{ .mmf + setf.d TAN_RSHF = tan_GR_rshf + ldfe tan_Pi_by_2_hi = [tan_AD],16 + fclass.m.unc p7,p0 = f8, 0x23 // Test for x=inf +} +;; + +{ .mfb + ldfe tan_Pi_by_2_lo = [tan_ADQ],16 + fclass.m.unc p8,p0 = f8, 0xc3 // Test for x=nan +(p6) br.ret.spnt b0 ;; // Exit for x=0 +} + +{ .mfi + ldfpd tan_P14,tan_P15 = [tan_AD],16 +(p7) frcpa.s0 f8,p9=f0,f0 // Set qnan indef if x=inf + mov tan_GR_10009 = 0x10009 +} +{ .mib + ldfpd tan_Q8,tan_Q9 = [tan_ADQ],16 + nop.i 999 +(p7) br.ret.spnt b0 ;; // Exit for x=inf +} + +{ .mfi + ldfpd tan_P12,tan_P13 = [tan_AD],16 +(p8) fma.s f8=f8,f1,f8 // Set qnan if x=nan + nop.i 999 +} +{ .mib + ldfpd tan_Q4,tan_Q5 = [tan_ADQ],16 + nop.i 999 +(p8) br.ret.spnt b0 ;; // Exit for x=nan +} + +{ .mmi + getf.exp tan_signexp = tan_NORM_f8 + ldfpd tan_P8,tan_P9 = [tan_AD],16 + nop.i 999 
;; +} + +// Multiply x by scaled 2/pi and add large const to shift integer part of W to +// rightmost bits of significand +{ .mfi + ldfpd tan_Q6,tan_Q7 = [tan_ADQ],16 + fma.s1 TAN_W_2TO64_RSH = tan_NORM_f8,TAN_INV_PI_BY_2_2TO64,TAN_RSHF_2TO64 + nop.i 999 ;; +} + +{ .mmi + ldfpd tan_P10,tan_P11 = [tan_AD],16 + nop.m 999 + and tan_exp = tan_GR_17_ones, tan_signexp ;; +} + + +// p7 is true if we must call DBX TAN +// p7 is true if f8 exp is > 0x10009 (which includes all ones +// NAN or inf) +{ .mmi + ldfpd tan_Q0,tan_Q1 = [tan_ADQ],16 + cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009 + nop.i 999 ;; +} + + +{ .mmb + ldfpd tan_P4,tan_P5 = [tan_AD],16 + nop.m 999 +(p7) br.cond.spnt L(TAN_DBX) ;; +} + + +{ .mmi + ldfpd tan_Q2,tan_Q3 = [tan_ADQ],16 + nop.m 999 + nop.i 999 ;; +} + + + +// TAN_NFLOAT = Round_Int_Nearest(tan_W) +{ .mfi + ldfpd tan_P6,tan_P7 = [tan_AD],16 + fms.s1 TAN_NFLOAT = TAN_W_2TO64_RSH,TAN_2TOM64,TAN_RSHF + nop.i 999 ;; +} + + +{ .mfi + ldfd tan_Q10 = [tan_ADQ] + nop.f 999 + nop.i 999 ;; +} + + +{ .mfi + ldfpd tan_P0,tan_P1 = [tan_AD],16 + nop.f 999 + nop.i 999 ;; +} + + +{ .mfi + getf.sig tan_GR_n = TAN_W_2TO64_RSH + nop.f 999 + nop.i 999 ;; +} + +// tan_r = -tan_Nfloat * tan_Pi_by_2_hi + x +{ .mfi + ldfpd tan_P2,tan_P3 = [tan_AD] + fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_hi, tan_NORM_f8 + nop.i 999 ;; +} + + +// p8 ==> even +// p9 ==> odd +{ .mmi + and tan_GR_N_odd_even = 0x1, tan_GR_n ;; + nop.m 999 + cmp.eq.unc p8,p9 = tan_GR_N_odd_even, r0 ;; +} + + +// tan_r = tan_r -tan_Nfloat * tan_Pi_by_2_lo +{ .mfi + nop.m 999 + fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 + fma.s1 tan_rsq = tan_r, tan_r, f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p9) frcpa.s1 tan_y0, p10 = f1,tan_r + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v18 = tan_rsq, tan_P15, tan_P14 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v16 = 
tan_rsq, tan_P13, tan_P12 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p9) fnma.s1 tan_d = tan_r, tan_y0, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z11 = tan_rsq, tan_Q9, tan_Q8 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z12 = tan_rsq, tan_rsq, f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z7 = tan_rsq, tan_Q5, tan_Q4 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v11 = tan_v17, tan_v13, tan_v12 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z8 = tan_rsq, tan_Q7, tan_Q6 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z3 = tan_rsq, tan_Q1, tan_Q0 + nop.i 999 ;; +} + + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_dsq = tan_d, tan_d, f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z10 = tan_z12, tan_Q10, tan_z11 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z9 = tan_z12, tan_z12,f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z4 = tan_rsq, tan_Q3, tan_Q2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z6 = tan_z12, tan_z8, tan_z7 + nop.i 999 ;; +} + + + +{ 
.mfi + nop.m 999 +(p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v2 = tan_v14, tan_v6, tan_v3 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z2 = tan_z12, tan_z4, tan_z3 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z5 = tan_z9, tan_z10, tan_z6 + nop.i 999 ;; +} + + +{ .mfi + nop.m 999 +(p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0 + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s1 tan_rcube = tan_rsq, tan_r, f0 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fma.s1 tan_z1 = tan_z9, tan_z5, tan_z2 + nop.i 999 ;; +} + + + +{ .mfi + nop.m 999 +(p8) fma.s.s0 f8 = tan_v1, tan_rcube, tan_r + nop.i 999 +} +{ .mfb + nop.m 999 +(p9) fms.s.s0 f8 = tan_r, tan_z1, tan_inv_r + br.ret.sptk b0 ;; +} +.endp tanf# +ASM_SIZE_DIRECTIVE(tanf#) + + +.proc __libm_callout +__libm_callout: +L(TAN_DBX): +.prologue + +{ .mfi + nop.m 0 + fmerge.s f9 = f0,f0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs +} +;; + +{ .mfi + mov GR_SAVE_GP=gp + nop.f 0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 +} + +.body +{ .mfb + nop.m 999 + nop.f 999 + br.call.sptk.many b0=__libm_tan# ;; +} + + +{ .mfi + mov gp = GR_SAVE_GP + fnorm.s f8 = f8 + mov b0 = GR_SAVE_B0 +} +;; + + +{ .mib + nop.m 999 + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +;; +} + + +.endp __libm_callout +ASM_SIZE_DIRECTIVE(__libm_callout) + +.type __libm_tan#,@function +.global __libm_tan# diff --git a/sysdeps/ia64/fpu/s_tanl.S b/sysdeps/ia64/fpu/s_tanl.S new file mode 100644 index 0000000000..d7cc3ee5ab --- /dev/null +++ b/sysdeps/ia64/fpu/s_tanl.S @@ -0,0 +1,3057 @@ +.file "tanl.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// 
All rights reserved. +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+// +// ********************************************************************* +// +// History: +// +// 2/02/2000 (hand-optimized) +// 4/04/00 Unwind support added +// 12/28/00 Fixed false invalid flags +// +// ********************************************************************* +// +// Function: tanl(x) = tangent(x), for double-extended precision x values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9-f15 +// f32-f112 +// +// General Purpose Registers: +// r32-r48 +// r49-r50 (Used to pass arguments to pi_by_2 reduce routine) +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions do not occur +// Underflow exceptions raised when appropriate for tan +// (No specialized error handling for this routine) +// Inexact raised when appropriate by algorithm +// +// tan(SNaN) = QNaN +// tan(QNaN) = QNaN +// tan(inf) = QNaN +// tan(+/-0) = +/-0 +// +// ********************************************************************* +// +// Mathematical Description +// +// We consider the computation of FPTANL of Arg. Now, given +// +// Arg = N pi/2 + alpha, |alpha| <= pi/4, +// +// basic mathematical relationship shows that +// +// tan( Arg ) = tan( alpha ) if N is even; +// = -cot( alpha ) otherwise. +// +// The value of alpha is obtained by argument reduction and +// represented by two working precision numbers r and c where +// +// alpha = r + c accurately. +// +// The reduction method is described in a previous write up. +// The argument reduction scheme identifies 4 cases. 
For Cases 2 +// and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be +// computed very easily by 2 or 3 terms of the Taylor series +// expansion as follows: +// +// Case 2: +// ------- +// +// tan(r + c) = r + c + r^3/3 ...accurately +// -cot(r + c) = -1/(r+c) + r/3 ...accurately +// +// Case 4: +// ------- +// +// tan(r + c) = r + c + r^3/3 + 2r^5/15 ...accurately +// -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately +// +// +// The only cases left are Cases 1 and 3 of the argument reduction +// procedure. These two cases will be merged since after the +// argument is reduced in either case, we have the reduced argument +// represented as r + c and that the magnitude |r + c| is not small +// enough to allow the usage of a very short approximation. +// +// The greatest challenge of this task is that the second terms of +// the Taylor series for tan(r) and -cot(r) +// +// r + r^3/3 + 2 r^5/15 + ... +// +// and +// +// -1/r + r/3 + r^3/45 + ... +// +// are not very small when |r| is close to pi/4 and the rounding +// errors will be a concern if simple polynomial accumulation is +// used. When |r| < 2^(-2), however, the second terms will be small +// enough (5 bits or so of right shift) that a normal Horner +// recurrence suffices. Hence there are two cases that we consider +// in the accurate computation of tan(r) and cot(r), |r| <= pi/4. +// +// Case small_r: |r| < 2^(-2) +// -------------------------- +// +// Since Arg = N pi/2 + r + c accurately, we have +// +// tan(Arg) = tan(r+c) for N even, +// = -cot(r+c) otherwise. +// +// Here for this case, both tan(r) and -cot(r) can be approximated +// by simple polynomials: +// +// tan(r) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 +// -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 +// +// accurately. Since |r| is relatively small, tan(r+c) and +// -cot(r+c) can be accurately approximated by replacing r with +// r+c only in the first two terms of the corresponding polynomials. 
+// +// Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to +// almost 64 sig. bits, thus +// +// P1_1 (r+c)^3 = P1_1 r^3 + c * r^2 accurately. +// +// Hence, +// +// tan(r+c) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 +// + c*(1 + r^2) +// +// -cot(r+c) = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 +// + Q1_1*c +// +// +// Case normal_r: 2^(-2) <= |r| <= pi/4 +// ------------------------------------ +// +// This case is more likely than the previous one if one considers +// r to be uniformly distributed in [-pi/4 pi/4]. +// +// The required calculation is either +// +// tan(r + c) = tan(r) + correction, or +// -cot(r + c) = -cot(r) + correction. +// +// Specifically, +// +// tan(r + c) = tan(r) + c tan'(r) + O(c^2) +// = tan(r) + c sec^2(r) + O(c^2) +// = tan(r) + c SEC_sq ...accurately +// as long as SEC_sq approximates sec^2(r) +// to, say, 5 bits or so. +// +// Similarly, +// +// -cot(r + c) = -cot(r) - c cot'(r) + O(c^2) +// = -cot(r) + c csc^2(r) + O(c^2) +// = -cot(r) + c CSC_sq ...accurately +// as long as CSC_sq approximates csc^2(r) +// to, say, 5 bits or so. +// +// We therefore concentrate on accurately calculating tan(r) and +// cot(r) for a working-precision number r, |r| <= pi/4 to within +// 0.1% or so. +// +// We will employ a table-driven approach. Let +// +// r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63 +// = sgn_r * ( B + x ) +// +// where +// +// B = 2^k * 1.b_1 b_2 ... b_5 1 +// x = |r| - B +// +// Now, +// tan(B) + tan(x) +// tan( B + x ) = ------------------------ +// 1 - tan(B)*tan(x) +// +// / \ +// | tan(B) + tan(x) | + +// = tan(B) + | ------------------------ - tan(B) | +// | 1 - tan(B)*tan(x) | +// \ / +// +// sec^2(B) * tan(x) +// = tan(B) + ------------------------ +// 1 - tan(B)*tan(x) +// +// (1/[sin(B)*cos(B)]) * tan(x) +// = tan(B) + -------------------------------- +// cot(B) - tan(x) +// +// +// Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are +// calculated beforehand and stored in a table. 
Since +// +// |x| <= 2^k * 2^(-6) <= 2^(-7) (because k = -1, -2) +// +// a very short polynomial will be sufficient to approximate tan(x) +// accurately. The details involved in computing the last expression +// will be given in the next section on algorithm description. +// +// +// Now, we turn to the case where cot( B + x ) is needed. +// +// +// 1 - tan(B)*tan(x) +// cot( B + x ) = ------------------------ +// tan(B) + tan(x) +// +// / \ +// | 1 - tan(B)*tan(x) | + +// = cot(B) + | ----------------------- - cot(B) | +// | tan(B) + tan(x) | +// \ / +// +// [tan(B) + cot(B)] * tan(x) +// = cot(B) - ---------------------------- +// tan(B) + tan(x) +// +// (1/[sin(B)*cos(B)]) * tan(x) +// = cot(B) - -------------------------------- +// tan(B) + tan(x) +// +// +// Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that +// are needed are the same set of values needed in the previous +// case. +// +// Finally, we can put all the ingredients together as follows: +// +// Arg = N * pi/2 + r + c ...accurately +// +// tan(Arg) = tan(r) + correction if N is even; +// = -cot(r) + correction otherwise. +// +// For Cases 2 and 4, +// +// Case 2: +// tan(Arg) = tan(r + c) = r + c + r^3/3 N even +// = -cot(r + c) = -1/(r+c) + r/3 N odd +// Case 4: +// tan(Arg) = tan(r + c) = r + c + r^3/3 + 2r^5/15 N even +// = -cot(r + c) = -1/(r+c) + r/3 + r^3/45 N odd +// +// +// For Cases 1 and 3, +// +// Case small_r: |r| < 2^(-2) +// +// tan(Arg) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 +// + c*(1 + r^2) N even +// +// = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 +// + Q1_1*c N odd +// +// Case normal_r: 2^(-2) <= |r| <= pi/4 +// +// tan(Arg) = tan(r) + c * sec^2(r) N even +// = -cot(r) + c * csc^2(r) otherwise +// +// For N even, +// +// tan(Arg) = tan(r) + c*sec^2(r) +// = tan( sgn_r * (B+x) ) + c * sec^2(|r|) +// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) ) +// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) ) +// +// since B approximates |r| to 2^(-6) in relative accuracy. 
+// +// / (1/[sin(B)*cos(B)]) * tan(x) +// tan(Arg) = sgn_r * | tan(B) + -------------------------------- +// \ cot(B) - tan(x) +// \ +// + CORR | + +// / +// where +// +// CORR = sgn_r*c*tan(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)). +// +// For N odd, +// +// tan(Arg) = -cot(r) + c*csc^2(r) +// = -cot( sgn_r * (B+x) ) + c * csc^2(|r|) +// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) ) +// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) ) +// +// since B approximates |r| to 2^(-6) in relative accuracy. +// +// / (1/[sin(B)*cos(B)]) * tan(x) +// tan(Arg) = sgn_r * | -cot(B) + -------------------------------- +// \ tan(B) + tan(x) +// \ +// + CORR | + +// / +// where +// +// CORR = sgn_r*c*cot(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)). +// +// +// The actual algorithm prescribes how all the mathematical formulas +// are calculated. +// +// +// 2. Algorithmic Description +// ========================== +// +// 2.1 Computation for Cases 2 and 4. +// ---------------------------------- +// +// For Case 2, we use two-term polynomials. 
+// +// For N even, +// +// rsq := r * r +// Result := c + r * rsq * P1_1 +// Result := r + Result ...in user-defined rounding +// +// For N odd, +// S_hi := -frcpa(r) ...8 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits +// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c ) +// ...S_hi + S_lo is -1/(r+c) to extra precision +// S_lo := S_lo + Q1_1*r +// +// Result := S_hi + S_lo ...in user-defined rounding +// +// For Case 4, we use three-term polynomials +// +// For N even, +// +// rsq := r * r +// Result := c + r * rsq * (P1_1 + rsq * P1_2) +// Result := r + Result ...in user-defined rounding +// +// For N odd, +// S_hi := -frcpa(r) ...8 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits +// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c ) +// ...S_hi + S_lo is -1/(r+c) to extra precision +// rsq := r * r +// P := Q1_1 + rsq*Q1_2 +// S_lo := S_lo + r*P +// +// Result := S_hi + S_lo ...in user-defined rounding +// +// +// Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are +// the same as those used in the small_r case of Cases 1 and 3 +// below. +// +// +// 2.2 Computation for Cases 1 and 3. +// ---------------------------------- +// This is further divided into the case of small_r, +// where |r| < 2^(-2), and the case of normal_r, where |r| lies between +// 2^(-2) and pi/4. +// +// Algorithm for the case of small_r +// --------------------------------- +// +// For N even, +// rsq := r * r +// Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3)) +// r_to_the_8 := rsq * rsq +// r_to_the_8 := r_to_the_8 * r_to_the_8 +// Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... 
rsq*P1_9)) +// CORR := c * ( 1 + rsq ) +// Poly := Poly1 + r_to_the_8*Poly2 +// Result := r*Poly + CORR +// Result := r + Result ...in user-defined rounding +// ...note that Poly1 and r_to_the_8 can be computed in parallel +// ...with Poly2 (Poly1 is intentionally set to be much +// ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden) +// +// For N odd, +// S_hi := -frcpa(r) ...8 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits +// S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits +// S_lo := S_hi*( (1 + S_hi*r) + S_hi*c ) +// ...S_hi + S_lo is -1/(r+c) to extra precision +// S_lo := S_lo + Q1_1*c +// +// ...S_hi and S_lo are computed in parallel with +// ...the following +// rsq := r*r +// P := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7)) +// +// Result := r*P + S_lo +// Result := S_hi + Result ...in user-defined rounding +// +// +// Algorithm for the case of normal_r +// ---------------------------------- +// +// Here, we first consider the computation of tan( r + c ). As +// presented in the previous section, +// +// tan( r + c ) = tan(r) + c * sec^2(r) +// = sgn_r * [ tan(B+x) + CORR ] +// CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)] +// +// because sec^2(r) = sec^2(|r|), and B approximates |r| to 6.5 bits. +// +// tan( r + c ) = +// / (1/[sin(B)*cos(B)]) * tan(x) +// sgn_r * | tan(B) + -------------------------------- + +// \ cot(B) - tan(x) +// \ +// CORR | + +// / +// +// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are +// calculated beforehand and stored in a table. Specifically, +// the table values are +// +// tan(B) as T_hi + T_lo; +// cot(B) as C_hi + C_lo; +// 1/[sin(B)*cos(B)] as SC_inv +// +// T_hi, C_hi are in double-precision memory format; +// T_lo, C_lo are in single-precision memory format; +// SC_inv is in extended-precision memory format. 
+// +// The value of tan(x) will be approximated by a short polynomial of +// the form +// +// tan(x) as x + x * P, where +// P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3)) +// +// Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x) +// to a relative accuracy better than 2^(-20). Thus, a good +// initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative +// division is: +// +// 1/(cot(B) - tan(x)) is approximately +// 1/(cot(B) - x) is +// tan(B)/(1 - x*tan(B)) is approximately +// T_hi / ( 1 - T_hi * x ) is approximately +// +// T_hi * [ 1 + (T_hi * x) + (T_hi * x)^2 ] +// +// The calculation of tan(r+c) therefore proceeds as follows: +// +// Tx := T_hi * x +// xsq := x * x +// +// V_hi := T_hi*(1 + Tx*(1 + Tx)) +// P := xsq * (P2_1 + xsq*(P2_2 + xsq*P2_3)) +// ...V_hi serves as an initial guess of 1/(cot(B) - tan(x)) +// ...good to about 20 bits of accuracy +// +// tanx := x + x*P +// D := C_hi - tanx +// ...D is a double precision denominator: cot(B) - tan(x) +// +// V_hi := V_hi + V_hi*(1 - V_hi*D) +// ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits +// +// V_lo := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ] +// - V_hi*C_lo ) ...observe all order +// ...V_hi + V_lo approximates 1/(cot(B) - tan(x)) +// ...to extra accuracy +// +// ... SC_inv(B) * (x + x*P) +// ... tan(B) + ------------------------- + CORR +// ... cot(B) - (x + x*P) +// ... +// ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR +// ... +// +// Sx := SC_inv * x +// CORR := sgn_r * c * SC_inv * T_hi +// +// ...put the ingredients together to compute +// ... SC_inv(B) * (x + x*P) +// ... tan(B) + ------------------------- + CORR +// ... cot(B) - (x + x*P) +// ... +// ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR +// ... +// ... = T_hi + T_lo + CORR + +// ... 
Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo) +// +// CORR := CORR + T_lo +// tail := V_lo + P*(V_hi + V_lo) +// tail := Sx * tail + CORR +// tail := Sx * V_hi + tail +// T_hi := sgn_r * T_hi +// +// ...T_hi + sgn_r*tail now approximates +// ...sgn_r*(tan(B+x) + CORR) accurately +// +// Result := T_hi + sgn_r*tail ...in user-defined +// ...rounding control +// ...It is crucial that independent paths be fully +// ...exploited for performance's sake. +// +// +// Next, we consider the computation of -cot( r + c ). As +// presented in the previous section, +// +// -cot( r + c ) = -cot(r) + c * csc^2(r) +// = sgn_r * [ -cot(B+x) + CORR ] +// CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)] +// +// because csc^2(r) = csc^2(|r|), and B approximates |r| to 6.5 bits. +// +// -cot( r + c ) = +// / (1/[sin(B)*cos(B)]) * tan(x) +// sgn_r * | -cot(B) + -------------------------------- + +// \ tan(B) + tan(x) +// \ +// CORR | + +// / +// +// The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are +// calculated beforehand and stored in a table. Specifically, +// the table values are +// +// tan(B) as T_hi + T_lo; +// cot(B) as C_hi + C_lo; +// 1/[sin(B)*cos(B)] as SC_inv +// +// T_hi, C_hi are in double-precision memory format; +// T_lo, C_lo are in single-precision memory format; +// SC_inv is in extended-precision memory format. +// +// The value of tan(x) will be approximated by a short polynomial of +// the form +// +// tan(x) as x + x * P, where +// P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3)) +// +// Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x) +// to a relative accuracy better than 2^(-18). 
Thus, a good +// initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative +// division is: +// +// 1/(tan(B) + tan(x)) is approximately +// 1/(tan(B) + x) is +// cot(B)/(1 + x*cot(B)) is approximately +// C_hi / ( 1 + C_hi * x ) is approximately +// +// C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ] +// +// The calculation of -cot(r+c) therefore proceeds as follows: +// +// Cx := C_hi * x +// xsq := x * x +// +// V_hi := C_hi*(1 - Cx*(1 - Cx)) +// P := xsq * (P2_1 + xsq*(P2_2 + xsq*P2_3)) +// ...V_hi serves as an initial guess of 1/(tan(B) + tan(x)) +// ...good to about 18 bits of accuracy +// +// tanx := x + x*P +// D := T_hi + tanx +// ...D is a double precision denominator: tan(B) + tan(x) +// +// V_hi := V_hi + V_hi*(1 - V_hi*D) +// ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits +// +// V_lo := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ] +// - V_hi*T_lo ) ...observe all order +// ...V_hi + V_lo approximates 1/(tan(B) + tan(x)) +// ...to extra accuracy +// +// ... SC_inv(B) * (x + x*P) +// ... -cot(B) + ------------------------- + CORR +// ... tan(B) + (x + x*P) +// ... +// ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR +// ... +// +// Sx := SC_inv * x +// CORR := sgn_r * c * SC_inv * C_hi +// +// ...put the ingredients together to compute +// ... SC_inv(B) * (x + x*P) +// ... -cot(B) + ------------------------- + CORR +// ... tan(B) + (x + x*P) +// ... +// ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR +// ... +// ... =-C_hi - C_lo + CORR + +// ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo) +// +// CORR := CORR - C_lo +// tail := V_lo + P*(V_hi + V_lo) +// tail := Sx * tail + CORR +// tail := Sx * V_hi + tail +// C_hi := -sgn_r * C_hi +// +// ...C_hi + sgn_r*tail now approximates +// ...sgn_r*(-cot(B+x) + CORR) accurately +// +// Result := C_hi + sgn_r*tail in user-defined rounding control +// ...It is crucial that independent paths be fully +// ...exploited for performance's sake. +// +// 3. 
Implementation Notes +// ======================= +// +// Table entries T_hi, T_lo; C_hi, C_lo; SC_inv +// +// Recall that 2^(-2) <= |r| <= pi/4; +// +// r = sgn_r * 2^k * 1.b_1 b_2 ... b_63 +// +// and +// +// B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1 +// +// Thus, for k = -2, possible values of B are +// +// B = 2^(-2) * ( 1 + index/32 + 1/64 ), +// index ranges from 0 to 31 +// +// For k = -1, however, since |r| <= pi/4 = 0.78... +// possible values of B are +// +// B = 2^(-1) * ( 1 + index/32 + 1/64 ) +// index ranges from 0 to 19. +// +// + +#include "libm_support.h" + +#ifdef _LIBC +.rodata +#else +.data +#endif +.align 128 + +TANL_BASE_CONSTANTS: +ASM_TYPE_DIRECTIVE(TANL_BASE_CONSTANTS,@object) +data4 0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24 + // two**-14, -two**-14 +data4 0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi +data4 0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0 +data4 0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1 +data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2 +data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3 +data4 0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63 +data4 0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0 +data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1 +data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2 +data4 0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4 +data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4 +data4 0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2 +data4 0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33 +data4 0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1 +data4 0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2 +data4 0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3 +data4 0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4 +data4 0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5 +data4 0x61C67914, 
0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6 +data4 0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7 +data4 0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8 +data4 0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9 +data4 0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1 +data4 0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2 +data4 0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3 +data4 0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4 +data4 0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5 +data4 0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6 +data4 0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7 +data4 0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1 +data4 0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2 +data4 0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3 +// +// Entries T_hi double-precision memory format +// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) +// Entries T_lo single-precision memory format +// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) +// +data4 0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000 +data4 0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000 +data4 0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000 +data4 0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000 +data4 0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000 +data4 0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000 +data4 0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000 +data4 0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000 +data4 0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000 +data4 0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000 +data4 0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000 +data4 0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000 +data4 0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000 +data4 0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000 +data4 0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000 +data4 0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000 +data4 0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000 +data4 0xEB9DE4DE, 
0x3FDA0FF6, 0x22859FA7, 0x00000000 +data4 0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000 +data4 0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000 +data4 0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000 +data4 0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000 +data4 0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000 +data4 0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000 +data4 0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000 +data4 0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000 +data4 0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000 +data4 0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000 +data4 0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000 +data4 0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000 +data4 0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000 +data4 0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000 +// +// Entries T_hi double-precision memory format +// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) +// Entries T_lo single-precision memory format +// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) +// +data4 0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000 +data4 0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000 +data4 0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000 +data4 0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000 +data4 0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000 +data4 0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000 +data4 0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000 +data4 0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000 +data4 0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000 +data4 0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000 +data4 0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000 +data4 0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000 +data4 0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000 +data4 0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000 +data4 0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000 +data4 0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000 +data4 0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000 +data4 0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000 +data4 0xF7E06E4B, 
0x3FF01E20, 0x2480D3EE, 0x00000000 +data4 0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000 +// +// Entries C_hi double-precision memory format +// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) +// Entries C_lo single-precision memory format +// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) +// +data4 0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000 +data4 0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000 +data4 0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000 +data4 0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000 +data4 0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000 +data4 0x5285B068, 0x400A855A, 0x25524F18, 0x00000000 +data4 0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000 +data4 0x646AF623, 0x4009188A, 0x254FD801, 0x00000000 +data4 0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000 +data4 0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000 +data4 0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000 +data4 0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000 +data4 0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000 +data4 0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000 +data4 0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000 +data4 0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000 +data4 0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000 +data4 0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000 +data4 0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000 +data4 0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000 +data4 0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000 +data4 0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000 +data4 0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000 +data4 0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000 +data4 0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000 +data4 0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000 +data4 0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000 +data4 0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000 +data4 0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000 +data4 0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000 +data4 0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000 +data4 0x54F7E60A, 
0x3FFD8FCA, 0x248D3058, 0x00000000 +// +// Entries C_hi double-precision memory format +// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) +// Entries C_lo single-precision memory format +// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) +// +data4 0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000 +data4 0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000 +data4 0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000 +data4 0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000 +data4 0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000 +data4 0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000 +data4 0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000 +data4 0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000 +data4 0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000 +data4 0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000 +data4 0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000 +data4 0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000 +data4 0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000 +data4 0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000 +data4 0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000 +data4 0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000 +data4 0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000 +data4 0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000 +data4 0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000 +data4 0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000 +// +// Entries SC_inv in Swapped IEEE format (extended) +// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) +// +data4 0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000 +data4 0x554B0EB0, 0x80092804, 0x00004001, 0x00000000 +data4 0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000 +data4 0x77378677, 0xF3086BA0, 0x00004000, 0x00000000 +data4 0xCCD4723C, 0xED154515, 0x00004000, 0x00000000 +data4 0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000 +data4 0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000 +data4 0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000 +data4 0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000 +data4 0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000 +data4 0x283BEBD5, 
0xCFAF16B1, 0x00004000, 0x00000000 +data4 0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000 +data4 0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000 +data4 0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000 +data4 0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000 +data4 0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000 +data4 0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000 +data4 0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000 +data4 0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000 +data4 0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000 +data4 0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000 +data4 0x7221235F, 0xAC862A23, 0x00004000, 0x00000000 +data4 0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000 +data4 0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000 +data4 0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000 +data4 0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000 +data4 0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000 +data4 0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000 +data4 0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000 +data4 0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000 +data4 0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000 +data4 0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000 +// +// Entries SC_inv in Swapped IEEE format (extended) +// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) +// +data4 0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000 +data4 0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000 +data4 0x61B798AF, 0x9147094F, 0x00004000, 0x00000000 +data4 0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000 +data4 0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000 +data4 0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000 +data4 0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000 +data4 0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000 +data4 0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000 +data4 0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000 +data4 0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000 +data4 0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000 +data4 0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000 +data4 
0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000 +data4 0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000 +data4 0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000 +data4 0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000 +data4 0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000 +data4 0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000 +data4 0xA07791FA, 0x80186650, 0x00004000, 0x00000000 +ASM_SIZE_DIRECTIVE(TANL_BASE_CONSTANTS) + +Arg = f8 +Result = f8 +fp_tmp = f9 +U_2 = f10 +rsq = f11 +C_hi = f12 +C_lo = f13 +T_hi = f14 +T_lo = f15 + +N_0 = f32 +d_1 = f33 +MPI_BY_4 = f34 +tail = f35 +tanx = f36 +Cx = f37 +Sx = f38 +sgn_r = f39 +CORR = f40 +P = f41 +D = f42 +ArgPrime = f43 +P_0 = f44 + +P2_1 = f45 +P2_2 = f46 +P2_3 = f47 + +P1_1 = f45 +P1_2 = f46 +P1_3 = f47 + +P1_4 = f48 +P1_5 = f49 +P1_6 = f50 +P1_7 = f51 +P1_8 = f52 +P1_9 = f53 + +TWO_TO_63 = f54 +NEGTWO_TO_63 = f55 +x = f56 +xsq = f57 +Tx = f58 +Tx1 = f59 +Set = f60 +poly1 = f61 +poly2 = f62 +Poly = f63 +Poly1 = f64 +Poly2 = f65 +r_to_the_8 = f66 +B = f67 +SC_inv = f68 +Pos_r = f69 +N_0_fix = f70 +PI_BY_4 = f71 +NEGTWO_TO_NEG2 = f72 +TWO_TO_24 = f73 +TWO_TO_NEG14 = f74 +TWO_TO_NEG33 = f75 +NEGTWO_TO_24 = f76 +NEGTWO_TO_NEG14 = f76 +NEGTWO_TO_NEG33 = f77 +two_by_PI = f78 +N = f79 +N_fix = f80 +P_1 = f81 +P_2 = f82 +P_3 = f83 +s_val = f84 +w = f85 +c = f86 +r = f87 +A = f89 +a = f90 +t = f91 +U_1 = f92 +d_2 = f93 +TWO_TO_NEG2 = f94 +Q1_1 = f95 +Q1_2 = f96 +Q1_3 = f97 +Q1_4 = f98 +Q1_5 = f99 +Q1_6 = f100 +Q1_7 = f101 +Q1_8 = f102 +S_hi = f103 +S_lo = f104 +V_hi = f105 +V_lo = f106 +U_hi = f107 +U_lo = f108 +U_hiabs = f109 +V_hiabs = f110 +V = f111 +Inv_P_0 = f112 + +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 +GR_SAVE_PFS = r35 +delta1 = r36 +table_ptr1 = r37 +table_ptr2 = r38 +i_0 = r39 +i_1 = r40 +N_fix_gr = r41 +N_inc = r42 +exp_Arg = r43 +exp_r = r44 +sig_r = r45 +lookup = r46 +table_offset = r47 +Create_B = r48 +gr_tmp = r49 + +.section .text +.global tanl +.proc tanl +tanl: +#ifdef _LIBC +.global __tanl +.proc __tanl +__tanl: +#endif 
+{ .mfi +alloc r32 = ar.pfs, 0,17,2,0 +(p0) fclass.m.unc p6,p0 = Arg, 0x1E7 + addl gr_tmp = -1,r0 +} +{ .mfi + nop.m 0 +(p0) fclass.nm.unc p7,p0 = Arg, 0x1FF + nop.i 0 +};; + +{ .mfi +(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp + nop.f 999 + nop.i 0 +} +;; +{ .mmi +(p0) ld8 table_ptr1 = [table_ptr1] + setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact + nop.i 999 +} +;; + +// +// Check for NatVals, Infs , NaNs, and Zeros +// Check for everything - if false, then must be pseudo-zero +// or pseudo-nan. +// Local table pointer +// +{ .mbb +(p0) add table_ptr2 = 96, table_ptr1 +(p6) br.cond.spnt L(TANL_SPECIAL) +(p7) br.cond.spnt L(TANL_SPECIAL) ;; +} +// +// Point to Inv_P_0 +// Branch out to deal with unsupporteds and special values. +// +{ .mmf +(p0) ldfs TWO_TO_24 = [table_ptr1],4 +(p0) ldfs TWO_TO_63 = [table_ptr2],4 +// +// Load -2**24, load -2**63. +// +(p0) fcmp.eq.s0 p0, p6 = Arg, f1 ;; +} +{ .mfi +(p0) ldfs NEGTWO_TO_63 = [table_ptr2],12 +(p0) fnorm.s1 Arg = Arg + nop.i 999 +} +// +// Load 2**24, Load 2**63. +// +{ .mmi +(p0) ldfs NEGTWO_TO_24 = [table_ptr1],12 ;; +// +// Do fcmp to generate Denormal exception +// - can't do FNORM (will generate Underflow when U is unmasked!) +// Normalize input argument. +// +(p0) ldfe two_by_PI = [table_ptr1],16 + nop.i 999 +} +{ .mmi +(p0) ldfe Inv_P_0 = [table_ptr2],16 ;; +(p0) ldfe d_1 = [table_ptr2],16 + nop.i 999 +} +// +// Decide about the paths to take: +// PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2 +// OTHERWISE - CASE 3 OR 4 +// Load inverse of P_0 . +// Set PR_6 if Arg <= -2**63 +// Are there any Infs, NaNs, or zeros? 
+// +{ .mmi +(p0) ldfe P_0 = [table_ptr1],16 ;; +(p0) ldfe d_2 = [table_ptr2],16 + nop.i 999 +} +// +// Set PR_8 if Arg <= -2**24 +// Set PR_6 if Arg >= 2**63 +// +{ .mmi +(p0) ldfe P_1 = [table_ptr1],16 ;; +(p0) ldfe PI_BY_4 = [table_ptr2],16 + nop.i 999 +} +// +// Set PR_8 if Arg >= 2**24 +// +{ .mmi +(p0) ldfe P_2 = [table_ptr1],16 ;; +(p0) ldfe MPI_BY_4 = [table_ptr2],16 + nop.i 999 +} +// +// Load P_2 and PI_BY_4 +// +{ .mfi +(p0) ldfe P_3 = [table_ptr1],16 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63 + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p7) fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24 + nop.i 999 ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Load P_3 and -PI_BY_4 +// +(p6) br.cond.spnt L(TANL_ARG_TOO_LARGE) ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Load 2**(-2). +// Load -2**(-2). +// Branch out if we have a special argument. +// Branch out if the magnitude of the input argument is too large +// - do this branch before the next. +// +(p8) br.cond.spnt L(TANL_LARGER_ARG) ;; +} +// +// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24 +// +{ .mfi +(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4 +// ARGUMENT REDUCTION CODE - CASE 1 and 2 +// Load 2**(-2). +// Load -2**(-2). +(p0) fmpy.s1 N = Arg,two_by_PI + nop.i 999 ;; +} +{ .mfi +(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],12 +// +// N = Arg * 2/pi +// +(p0) fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if Arg < pi/4, set PR_8. +// +(p8) fcmp.gt.s1 p8,p9= Arg,MPI_BY_4 + nop.i 999 ;; +} +// +// Case 1: Is |r| < 2**(-2). +// Arg is the same as r in this case. +// r = Arg +// c = 0 +// +{ .mfi +(p8) mov N_fix_gr = r0 +// +// if Arg > -pi/4, reset PR_8. +// Select the case when |Arg| < pi/4 - set PR[8] = true. +// Else Select the case when |Arg| >= pi/4 - set PR[9] = true. 
+// +(p0) fcvt.fx.s1 N_fix = N + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Grab the integer part of N . +// +(p8) mov r = Arg + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) mov c = f0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p8) fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 2: Place integer part of N in GP register. +// +(p9) fcvt.xf N = N_fix + nop.i 999 ;; +} +{ .mib +(p9) getf.sig N_fix_gr = N_fix + nop.i 999 +// +// Case 2: Convert integer N_fix back to normalized floating-point value. +// +(p10) br.cond.spnt L(TANL_SMALL_R) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p8) br.cond.sptk L(TANL_NORMAL_R) ;; +} +// +// Case 1: PR_3 is only affected when PR_1 is set. +// +{ .mmi +(p9) ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;; +// +// Case 2: Load 2**(-33). +// +(p9) ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 2: Load -2**(-33). +// +(p9) fnma.s1 s_val = N, P_1, Arg + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fmpy.s1 w = N, P_2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 2: w = N * P_2 +// Case 2: s_val = -N * P_1 + Arg +// +(p0) fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Decide between case_1 and case_2 reduce: +// +(p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 1_reduce: s <= -2**(-33) or s >= 2**(-33) +// Case 2_reduce: -2**(-33) < s < 2**(-33) +// +(p8) fsub.s1 r = s_val, w + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fmpy.s1 w = N, P_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fma.s1 U_1 = N, P_2, w + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Case 1_reduce: Is |r| < 2**(-2), if so set PR_10 +// else set PR_11. 
+// +(p8) fsub.s1 c = s_val, r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 1_reduce: r = s + w (change sign) +// Case 2_reduce: w = N * P_3 (change sign) +// +(p8) fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fsub.s1 r = s_val, U_1 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Case 1_reduce: c is complete here. +// c = c + w (w has not been negated.) +// Case 2_reduce: r is complete here - continue to calculate c . +// r = s - U_1 +// +(p9) fms.s1 U_2 = N, P_2, U_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 1_reduce: c = s - r +// Case 2_reduce: U_1 = N * P_2 + w +// +(p8) fsub.s1 c = c, w + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fsub.s1 s_val = s_val, r + nop.i 999 +} +{ .mfb + nop.m 999 +// +// Case 2_reduce: +// U_2 = N * P_2 - U_1 +// Not needed until later. +// +(p9) fadd.s1 U_2 = U_2, w +// +// Case 2_reduce: +// s = s - r +// U_2 = U_2 + w +// +(p10) br.cond.spnt L(TANL_SMALL_R) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p11) br.cond.sptk L(TANL_NORMAL_R) ;; +} +{ .mii + nop.m 999 +// +// Case 2_reduce: +// c = c - U_2 +// c is complete here +// Argument reduction ends here. +// +(p9) extr.u i_1 = N_fix_gr, 0, 1 ;; +(p9) cmp.eq.unc p11, p12 = 0x0000,i_1 ;; +} +{ .mfi + nop.m 999 +// +// Is i_1 even or odd? +// if i_1 == 0, set p11, else set p12. +// +(p11) fmpy.s1 rsq = r, r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) frcpa.s1 S_hi,p0 = f1, r + nop.i 999 +} + + + +// +// Case 1: Branch to SMALL_R or NORMAL_R. +// Case 1 is done now. +// + +{ .mfi +(p9) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp +(p9) fsub.s1 c = s_val, U_1 + nop.i 999 ;; +} +;; + +{ .mmi +(p9) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + + +{ .mmi +(p9) add table_ptr1 = 224, table_ptr1 ;; +(p9) ldfe P1_1 = [table_ptr1],144 + nop.i 999 ;; +} +// +// Get [i_1] - lsb of N_fix_gr . +// Load P1_1 and point to Q1_1 . 
+// +{ .mfi +(p9) ldfe Q1_1 = [table_ptr1] , 0 +// +// N even: rsq = r * Z +// N odd: S_hi = frcpa(r) +// +(p12) fmerge.ns S_hi = S_hi, S_hi + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Case 2_reduce: +// c = s - U_1 +// +(p9) fsub.s1 c = c, U_2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: Change sign of S_hi +// +(p11) fmpy.s1 rsq = rsq, P1_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: rsq = rsq * P1_1 +// N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary +// +(p11) fma.s1 Result = r, rsq, c + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: Result = c + r * rsq +// N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary +// +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: Result = Result + r +// N odd: poly1 = 1.0 + S_hi * r 32 bits partial +// +(p11) fadd.s0 Result = r, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: Result1 = Result + r +// N odd: S_hi = S_hi * poly1 + S_hi 32 bits +// +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * r + 1.0 64 bits partial +// +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * poly + 1.0 64 bits +// +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * r + 1.0 +// +(p12) fma.s1 poly1 = S_hi, c, poly1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * c + poly1 +// +(p12) fmpy.s1 S_lo = S_hi, poly1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: S_lo = S_hi * poly1 +// +(p12) fma.s1 S_lo = Q1_1, r, S_lo + nop.i 999 +} +{ .mfi + nop.m 999 +// +// N odd: Result = S_hi + S_lo +// +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set 
inexact + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// N odd: S_lo = S_lo + Q1_1 * r +// +(p12) fadd.s0 Result = S_hi, S_lo +(p0) br.ret.sptk b0 ;; +} + + +L(TANL_LARGER_ARG): + +// +// ARGUMENT REDUCTION CODE - CASE 3 and 4 +// + +{ .mfi +(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp +(p0) fmpy.s1 N_0 = Arg, Inv_P_0 + nop.i 999 +} +;; + +{ .mmi +(p0) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + + +// +// Adjust table_ptr1 to beginning of table. +// N_0 = Arg * Inv_P_0 +// +{ .mmi +(p0) add table_ptr1 = 8, table_ptr1 ;; +// +// Point to 2*-14 +// +(p0) ldfs TWO_TO_NEG14 = [table_ptr1], 4 + nop.i 999 ;; +} +// +// Load 2**(-14). +// +{ .mmi +(p0) ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;; +// +// N_0_fix = integer part of N_0 . +// Adjust table_ptr1 to beginning of table. +// +(p0) ldfs TWO_TO_NEG2 = [table_ptr1], 4 + nop.i 999 ;; +} +// +// Make N_0 the integer part. +// +{ .mfi +(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr1] +// +// Load -2**(-14). +// +(p0) fcvt.fx.s1 N_0_fix = N_0 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fcvt.xf N_0 = N_0_fix + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fnma.s1 ArgPrime = N_0, P_0, Arg + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 w = N_0, d_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// ArgPrime = -N_0 * P_0 + Arg +// w = N_0 * d_1 +// +(p0) fmpy.s1 N = ArgPrime, two_by_PI + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N = ArgPrime * 2/pi +// +(p0) fcvt.fx.s1 N_fix = N + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N_fix is the integer part. +// +(p0) fcvt.xf N = N_fix + nop.i 999 ;; +} +{ .mfi +(p0) getf.sig N_fix_gr = N_fix + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N is the integer part of the reduced-reduced argument. +// Put the integer in a GP register. 
+// +(p0) fnma.s1 s_val = N, P_1, ArgPrime + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fnma.s1 w = N, P_2, w + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// s_val = -N*P_1 + ArgPrime +// w = -N*P_2 + w +// +(p0) fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 3: r = s_val + w (Z complete) +// Case 4: U_hi = N_0 * d_1 +// +(p10) fmpy.s1 V_hi = N, P_2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fmpy.s1 U_hi = N_0, d_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 3: r = s_val + w (Z complete) +// Case 4: U_hi = N_0 * d_1 +// +(p11) fmpy.s1 V_hi = N, P_2 + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fmpy.s1 U_hi = N_0, d_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Decide between case 3 and 4: +// Case 3: s <= -2**(-14) or s >= 2**(-14) +// Case 4: -2**(-14) < s < 2**(-14) +// +(p10) fadd.s1 r = s_val, w + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fmpy.s1 w = N, P_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 4: We need abs of both U_hi and V_hi - dont +// worry about switched sign of V_hi . +// +(p11) fsub.s1 A = U_hi, V_hi + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Case 4: A = U_hi + V_hi +// Note: Worry about switched sign of V_hi, so subtract instead of add. +// +(p11) fnma.s1 V_lo = N, P_2, V_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fms.s1 U_lo = N_0, d_1, U_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fabs V_hiabs = V_hi + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Case 4: V_hi = N * P_2 +// w = N * P_3 +// Note the product does not include the (-) as in the writeup +// so (-) missing for V_hi and w . 
+(p10) fadd.s1 r = s_val, w + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 3: c = s_val - r +// Case 4: U_lo = N_0 * d_1 - U_hi +// +(p11) fabs U_hiabs = U_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fmpy.s1 w = N, P_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 4: Set P_12 if U_hiabs >= V_hiabs +// +(p11) fadd.s1 C_hi = s_val, A + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 4: C_hi = s_val + A +// +(p11) fadd.s1 t = U_lo, V_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 3: Is |r| < 2**(-2), if so set PR_7 +// else set PR_8. +// Case 3: If PR_7 is set, prepare to branch to Small_R. +// Case 3: If PR_8 is set, prepare to branch to Normal_R. +// +(p10) fsub.s1 c = s_val, r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 3: c = (s - r) + w (c complete) +// +(p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fms.s1 w = N_0, d_2, w + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 4: V_hi = N * P_2 +// w = N * P_3 +// Note the product does not include the (-) as in the writeup +// so (-) missing for V_hi and w . +// +(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup) +// Note: the (-) is still missing for V_hi . +// Case 4: w = w + N_0 * d_2 +// Note: the (-) is now incorporated in w . +// +(p10) fadd.s1 c = c, w +// +// Case 4: t = U_lo + V_lo +// Note: remember V_lo should be (-), subtract instead of add. NO +// +(p14) br.cond.spnt L(TANL_SMALL_R) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p15) br.cond.spnt L(TANL_NORMAL_R) ;; +} +{ .mfi + nop.m 999 +// +// Case 3: Vector off when |r| < 2**(-2). Recall that PR_3 will be true. +// The remaining stuff is for Case 4. 
+// +(p12) fsub.s1 a = U_hi, A +(p11) extr.u i_1 = N_fix_gr, 0, 1 ;; +} +{ .mfi + nop.m 999 +// +// Case 4: C_lo = s_val - C_hi +// +(p11) fadd.s1 t = t, w + nop.i 999 +} +{ .mfi + nop.m 999 +(p13) fadd.s1 a = V_hi, A + nop.i 999 ;; +} + + + +// +// Case 4: a = U_hi - A +// a = V_hi - A (do an add to account for missing (-) on V_hi +// + +{ .mfi +(p11) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp +(p11) fsub.s1 C_lo = s_val, C_hi + nop.i 999 +} +;; + + + +// +// Case 4: a = (U_hi - A) + V_hi +// a = (V_hi - A) + U_hi +// In each case account for negative missing form V_hi . +// + + +{ .mmi +(p11) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + + +// +// Case 4: C_lo = (s_val - C_hi) + A +// +{ .mmi +(p11) add table_ptr1 = 224, table_ptr1 ;; +(p11) ldfe P1_1 = [table_ptr1], 16 + nop.i 999 ;; +} +{ .mfi +(p11) ldfe P1_2 = [table_ptr1], 128 +// +// Case 4: w = U_lo + V_lo + w +// +(p12) fsub.s1 a = a, V_hi + nop.i 999 ;; +} +// +// Case 4: r = C_hi + C_lo +// +{ .mfi +(p11) ldfe Q1_1 = [table_ptr1], 16 +(p11) fadd.s1 C_lo = C_lo, A + nop.i 999 ;; +} +// +// Case 4: c = C_hi - r +// Get [i_1] - lsb of N_fix_gr. +// +{ .mfi +(p11) ldfe Q1_2 = [table_ptr1], 16 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p13) fsub.s1 a = U_hi, a + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fadd.s1 t = t, a + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 4: t = t + a +// +(p11) fadd.s1 C_lo = C_lo, t + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 4: C_lo = C_lo + t +// +(p11) fadd.s1 r = C_hi, C_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fsub.s1 c = C_hi, r + nop.i 999 +} +{ .mfi + nop.m 999 +// +// Case 4: c = c + C_lo finished. +// Is i_1 even or odd? +// if i_1 == 0, set PR_4, else set PR_5. +// +// r and c have been computed. +// We known whether this is the sine or cosine routine. 
+// Make sure ftz mode is set - should be automatic when using wre +(p0) fmpy.s1 rsq = r, r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fadd.s1 c = c , C_lo +(p11) cmp.eq.unc p11, p12 = 0x0000, i_1 ;; +} +{ .mfi + nop.m 999 +(p12) frcpa.s1 S_hi, p0 = f1, r + nop.i 999 +} +{ .mfi + nop.m 999 +// +// N odd: Change sign of S_hi +// +(p11) fma.s1 Result = rsq, P1_2, P1_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 P = rsq, Q1_2, Q1_1 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// N odd: Result = S_hi + S_lo (User supplied rounding mode for C1) +// +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: rsq = r * r +// N odd: S_hi = frcpa(r) +// +(p12) fmerge.ns S_hi = S_hi, S_hi + nop.i 999 +} +{ .mfi + nop.m 999 +// +// N even: rsq = rsq * P1_2 + P1_1 +// N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary +// +(p11) fmpy.s1 Result = rsq, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r,f1 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// N even: Result = Result * rsq +// N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary +// +(p11) fma.s1 Result = r, Result, c + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 +} +{ .mfi + nop.m 999 +// +// N odd: S_hi = S_hi * poly1 + S_hi 32 bits +// +(p11) fadd.s0 Result= r, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: Result = Result * r + c +// N odd: poly1 = 1.0 + S_hi * r 32 bits partial +// +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: Result1 = Result + r (Rounding mode S0) +// N odd: poly1 = S_hi * r + 1.0 64 bits partial +// +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * poly + S_hi 64 bits +// +(p12) fma.s1 
poly1 = S_hi, r, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * r + 1.0 +// +(p12) fma.s1 poly1 = S_hi, c, poly1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * c + poly1 +// +(p12) fmpy.s1 S_lo = S_hi, poly1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: S_lo = S_hi * poly1 +// +(p12) fma.s1 S_lo = P, r, S_lo + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// N odd: S_lo = S_lo + r * P +// +(p12) fadd.s0 Result = S_hi, S_lo +(p0) br.ret.sptk b0 ;; +} + + +L(TANL_SMALL_R): +{ .mii + nop.m 999 +(p0) extr.u i_1 = N_fix_gr, 0, 1 ;; +(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 rsq = r, r + nop.i 999 ;; +} +{ .mfi +(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp +(p12) frcpa.s1 S_hi, p0 = f1, r + nop.i 999 +} +;; + + +{ .mmi +(p0) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + +// ***************************************************************** +// ***************************************************************** +// ***************************************************************** + + +{ .mmi +(p0) add table_ptr1 = 224, table_ptr1 ;; +(p0) ldfe P1_1 = [table_ptr1], 16 + nop.i 999 ;; +} +// r and c have been computed. +// We known whether this is the sine or cosine routine. +// Make sure ftz mode is set - should be automatic when using wre +// |r| < 2**(-2) +{ .mfi +(p0) ldfe P1_2 = [table_ptr1], 16 +(p11) fmpy.s1 r_to_the_8 = rsq, rsq + nop.i 999 ;; +} +// +// Set table_ptr1 to beginning of constant table. +// Get [i_1] - lsb of N_fix_gr. +// +{ .mfi +(p0) ldfe P1_3 = [table_ptr1], 96 +// +// N even: rsq = r * r +// N odd: S_hi = frcpa(r) +// +(p12) fmerge.ns S_hi = S_hi, S_hi + nop.i 999 ;; +} +// +// Is i_1 even or odd? +// if i_1 == 0, set PR_11. +// if i_1 != 0, set PR_12. 
+// +{ .mfi +(p11) ldfe P1_9 = [table_ptr1], -16 +// +// N even: Poly2 = P1_7 + Poly2 * rsq +// N odd: poly2 = Q1_5 + poly2 * rsq +// +(p11) fadd.s1 CORR = rsq, f1 + nop.i 999 ;; +} +{ .mmi +(p11) ldfe P1_8 = [table_ptr1], -16 ;; +// +// N even: Poly1 = P1_2 + P1_3 * rsq +// N odd: poly1 = 1.0 + S_hi * r +// 16 bits partial account for necessary (-1) +// +(p11) ldfe P1_7 = [table_ptr1], -16 + nop.i 999 ;; +} +// +// N even: Poly1 = P1_1 + Poly1 * rsq +// N odd: S_hi = S_hi + S_hi * poly1) 16 bits account for necessary +// +{ .mfi +(p11) ldfe P1_6 = [table_ptr1], -16 +// +// N even: Poly2 = P1_5 + Poly2 * rsq +// N odd: poly2 = Q1_3 + poly2 * rsq +// +(p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8 + nop.i 999 ;; +} +// +// N even: Poly1 = Poly1 * rsq +// N odd: poly1 = 1.0 + S_hi * r 32 bits partial +// +{ .mfi +(p11) ldfe P1_5 = [table_ptr1], -16 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} + +// +// N even: CORR = CORR * c +// N odd: S_hi = S_hi * poly1 + S_hi 32 bits +// + +// +// N even: Poly2 = P1_6 + Poly2 * rsq +// N odd: poly2 = Q1_4 + poly2 * rsq +// + +{ .mmf +(p11) ldfe P1_4 = [table_ptr1], -16 +(p0) addl table_ptr2 = @ltoff(TANL_BASE_CONSTANTS), gp +(p11) fmpy.s1 CORR = CORR, c +} +;; + + +{ .mmi +(p0) ld8 table_ptr2 = [table_ptr2] + nop.m 999 + nop.i 999 +} +;; + + +{ .mii +(p0) add table_ptr2 = 464, table_ptr2 + nop.i 999 ;; + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fma.s1 Poly1 = P1_3, rsq, P1_2 + nop.i 999 ;; +} +{ .mfi +(p0) ldfe Q1_7 = [table_ptr2], -16 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; +} +{ .mfi +(p0) ldfe Q1_6 = [table_ptr2], -16 +(p11) fma.s1 Poly2 = P1_9, rsq, P1_8 + nop.i 999 ;; +} +{ .mmi +(p0) ldfe Q1_5 = [table_ptr2], -16 ;; +(p12) ldfe Q1_4 = [table_ptr2], -16 + nop.i 999 ;; +} +{ .mfi +(p12) ldfe Q1_3 = [table_ptr2], -16 +// +// N even: Poly2 = P1_8 + P1_9 * rsq +// N odd: poly2 = Q1_6 + Q1_7 * rsq +// +(p11) fma.s1 Poly1 = Poly1, rsq, P1_1 + nop.i 999 ;; +} +{ .mfi +(p12) ldfe Q1_2 = [table_ptr2], -16 
+(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} +{ .mfi +(p12) ldfe Q1_1 = [table_ptr2], -16 +(p11) fma.s1 Poly2 = Poly2, rsq, P1_7 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: CORR = rsq + 1 +// N even: r_to_the_8 = rsq * rsq +// +(p11) fmpy.s1 Poly1 = Poly1, rsq + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly2 = Q1_7, rsq, Q1_6 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fma.s1 Poly2 = Poly2, rsq, P1_6 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly2 = poly2, rsq, Q1_5 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p11) fma.s1 Poly2= Poly2, rsq, P1_5 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly2 = poly2, rsq, Q1_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: r_to_the_8 = r_to_the_8 * r_to_the_8 +// N odd: poly1 = S_hi * r + 1.0 64 bits partial +// +(p11) fma.s1 Poly2 = Poly2, rsq, P1_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: Result = CORR + Poly * r +// N odd: P = Q1_1 + poly2 * rsq +// +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly2 = poly2, rsq, Q1_3 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: Poly2 = P1_4 + Poly2 * rsq +// N odd: poly2 = Q1_2 + poly2 * rsq +// +(p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, c, poly1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fma.s1 poly2 = poly2, rsq, Q1_2 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// N even: Poly = Poly1 + Poly2 * r_to_the_8 +// N odd: S_hi = S_hi * poly1 + S_hi 64 bits +// +(p11) fma.s1 Result = Poly, r, CORR + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: Result = r + Result (User supplied rounding mode) +// N odd: poly1 = S_hi * c + poly1 +// +(p12) fmpy.s1 S_lo = S_hi, poly1 + nop.i 999 +} +{ .mfi + nop.m 999 
+(p12) fma.s1 P = poly2, rsq, Q1_1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: poly1 = S_hi * r + 1.0 +// +// +// N odd: S_lo = S_hi * poly1 +// +(p11) fadd.s0 Result = Result, r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: S_lo = Q1_1 * c + S_lo +// +(p12) fma.s1 S_lo = Q1_1, c, S_lo + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: Result = S_lo + r * P +// +(p12) fma.s1 Result = P, r, S_lo + nop.i 999 ;; +} +{ .mfb + nop.m 999 +// +// N odd: Result = Result + S_hi (user supplied rounding mode) +// +(p12) fadd.s0 Result = Result, S_hi +(p0) br.ret.sptk b0 ;; +} + + +L(TANL_NORMAL_R): +{ .mfi +(p0) getf.sig sig_r = r +// ******************************************************************* +// ******************************************************************* +// ******************************************************************* +// +// r and c have been computed. +// Make sure ftz mode is set - should be automatic when using wre +// +// +// Get [i_1] - lsb of N_fix_gr alone. +// +(p0) fmerge.s Pos_r = f1, r +(p0) extr.u i_1 = N_fix_gr, 0, 1 ;; +} +{ .mfi + nop.m 999 +(p0) fmerge.s sgn_r = r, f1 +(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 ;; +} +{ .mfi + nop.m 999 + nop.f 999 +(p0) extr.u lookup = sig_r, 58, 5 +} +{ .mlx + nop.m 999 +(p0) movl Create_B = 0x8200000000000000 ;; +} +{ .mfi +(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp + nop.f 999 +(p0) dep Create_B = lookup, Create_B, 58, 5 +} +;; + + +// +// Get [i_1] - lsb of N_fix_gr alone. +// Pos_r = abs (r) +// + + +{ .mmi +(p0) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 +} +;; + + +{ .mmi + nop.m 999 +(p0) setf.sig B = Create_B +// +// Set table_ptr1 and table_ptr2 to base address of +// constant table. +// +(p0) add table_ptr1 = 480, table_ptr1 ;; +} +{ .mmb + nop.m 999 +// +// Is i_1 or i_0 == 0 ? +// Create the constant 1 00000 1000000000000000000000... 
+// +(p0) ldfe P2_1 = [table_ptr1], 16 + nop.b 999 +} +{ .mmi + nop.m 999 ;; +(p0) getf.exp exp_r = Pos_r + nop.i 999 +} +// +// Get r's exponent +// Get r's significand +// +{ .mmi +(p0) ldfe P2_2 = [table_ptr1], 16 ;; +// +// Get the 5 bits or r for the lookup. 1.xxxxx .... +// from sig_r. +// Grab lsb of exp of B +// +(p0) ldfe P2_3 = [table_ptr1], 16 + nop.i 999 ;; +} +{ .mii + nop.m 999 +(p0) andcm table_offset = 0x0001, exp_r ;; +(p0) shl table_offset = table_offset, 9 ;; +} +{ .mii + nop.m 999 +// +// Deposit 0 00000 1000000000000000000000... on +// 1 xxxxx yyyyyyyyyyyyyyyyyyyyyy..., +// getting rid of the ys. +// Is B = 2** -2 or B= 2** -1? If 2**-1, then +// we want an offset of 512 for table addressing. +// +(p0) shladd table_offset = lookup, 4, table_offset ;; +// +// B = ........ 1xxxxx 1000000000000000000... +// +(p0) add table_ptr1 = table_ptr1, table_offset ;; +} +{ .mmb + nop.m 999 +// +// B = ........ 1xxxxx 1000000000000000000... +// Convert B so it has the same exponent as Pos_r +// +(p0) ldfd T_hi = [table_ptr1], 8 + nop.b 999 ;; +} + + + +// +// x = |r| - B +// Load T_hi. +// Load C_hi. +// + +{ .mmf +(p0) addl table_ptr2 = @ltoff(TANL_BASE_CONSTANTS), gp +(p0) ldfs T_lo = [table_ptr1] +(p0) fmerge.se B = Pos_r, B +} +;; + + +{ .mmi +(p0) ld8 table_ptr2 = [table_ptr2] + nop.m 999 + nop.i 999 +} +;; + + +{ .mii +(p0) add table_ptr2 = 1360, table_ptr2 + nop.i 999 ;; +(p0) add table_ptr2 = table_ptr2, table_offset ;; +} +{ .mfi +(p0) ldfd C_hi = [table_ptr2], 8 +(p0) fsub.s1 x = Pos_r, B + nop.i 999 ;; +} +{ .mii +(p0) ldfs C_lo = [table_ptr2],255 + nop.i 999 ;; +// +// xsq = x * x +// N even: Tx = T_hi * x +// Load T_lo. +// Load C_lo - increment pointer to get SC_inv +// - cant get all the way, do an add later. 
+// +(p0) add table_ptr2 = 569, table_ptr2 ;; +} +// +// N even: Tx1 = Tx + 1 +// N odd: Cx1 = 1 - Cx +// +{ .mfi +(p0) ldfe SC_inv = [table_ptr2], 0 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 xsq = x, x + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fmpy.s1 Tx = T_hi, x + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fmpy.s1 Cx = C_hi, x + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: Cx = C_hi * x +// +(p0) fma.s1 P = P2_3, xsq, P2_2 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// N even and odd: P = P2_3 + P2_2 * xsq +// +(p11) fadd.s1 Tx1 = Tx, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: D = C_hi - tanx +// N odd: D = T_hi + tanx +// +(p11) fmpy.s1 CORR = SC_inv, T_hi + nop.i 999 +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 Sx = SC_inv, x + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fmpy.s1 CORR = SC_inv, C_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fsub.s1 V_hi = f1, Cx + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 P = P, xsq, P2_1 + nop.i 999 +} +{ .mfi + nop.m 999 +// +// N even and odd: P = P2_1 + P * xsq +// +(p11) fma.s1 V_hi = Tx, Tx1, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: Result = sgn_r * tail + T_hi (user rounding mode for C1) +// N odd: Result = sgn_r * tail + C_hi (user rounding mode for C1) +// +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 CORR = CORR, c + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fnma.s1 V_hi = Cx,V_hi,f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: V_hi = Tx * Tx1 + 1 +// N odd: Cx1 = 1 - Cx * Cx1 +// +(p0) fmpy.s1 P = P, xsq + nop.i 999 +} +{ .mfi + nop.m 999 +// +// N even and odd: P = P * xsq +// +(p11) fmpy.s1 V_hi = V_hi, T_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even and odd: tail = P * tail + V_lo +// +(p11) fmpy.s1 T_hi = sgn_r, T_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fmpy.s1 CORR = CORR, sgn_r + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p12) fmpy.s1 V_hi = V_hi,C_hi + nop.i 999 
;; +} +{ .mfi + nop.m 999 +// +// N even: V_hi = T_hi * V_hi +// N odd: V_hi = C_hi * V_hi +// +(p0) fma.s1 tanx = P, x, x + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fnmpy.s1 C_hi = sgn_r, C_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: V_lo = 1 - V_hi + C_hi +// N odd: V_lo = 1 - V_hi + T_hi +// +(p11) fadd.s1 CORR = CORR, T_lo + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fsub.s1 CORR = CORR, C_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even and odd: tanx = x + x * P +// N even and odd: Sx = SC_inv * x +// +(p11) fsub.s1 D = C_hi, tanx + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fadd.s1 D = T_hi, tanx + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N odd: CORR = SC_inv * C_hi +// N even: CORR = SC_inv * T_hi +// +(p0) fnma.s1 D = V_hi, D, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even and odd: D = 1 - V_hi * D +// N even and odd: CORR = CORR * c +// +(p0) fma.s1 V_hi = V_hi, D, V_hi + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even and odd: V_hi = V_hi + V_hi * D +// N even and odd: CORR = sgn_r * CORR +// +(p11) fnma.s1 V_lo = V_hi, C_hi, f1 + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fnma.s1 V_lo = V_hi, T_hi, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: CORR = COOR + T_lo +// N odd: CORR = CORR - C_lo +// +(p11) fma.s1 V_lo = tanx, V_hi, V_lo + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fnma.s1 V_lo = tanx, V_hi, V_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: V_lo = V_lo + V_hi * tanx +// N odd: V_lo = V_lo - V_hi * tanx +// +(p11) fnma.s1 V_lo = C_lo, V_hi, V_lo + nop.i 999 +} +{ .mfi + nop.m 999 +(p12) fnma.s1 V_lo = T_lo, V_hi, V_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: V_lo = V_lo - V_hi * C_lo +// N odd: V_lo = V_lo - V_hi * T_lo +// +(p0) fmpy.s1 V_lo = V_hi, V_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even and odd: V_lo = V_lo * V_hi +// +(p0) fadd.s1 tail = V_hi, V_lo + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even and odd: tail = V_hi + V_lo +// +(p0) fma.s1 tail = tail, P, V_lo + 
nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even: T_hi = sgn_r * T_hi +// N odd : C_hi = -sgn_r * C_hi +// +(p0) fma.s1 tail = tail, Sx, CORR + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even and odd: tail = Sx * tail + CORR +// +(p0) fma.s1 tail = V_hi, Sx, tail + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// N even an odd: tail = Sx * V_hi + tail +// +(p11) fma.s0 Result = sgn_r, tail, T_hi + nop.i 999 +} +{ .mfb + nop.m 999 +(p12) fma.s0 Result = sgn_r, tail, C_hi +(p0) br.ret.sptk b0 ;; +} + +L(TANL_SPECIAL): +{ .mfb + nop.m 999 +(p0) fmpy.s0 Arg = Arg, f0 +(p0) br.ret.sptk b0 ;; +} +// +// Code for NaNs, Unsupporteds, Infs, or +/- zero ? +// Invalid raised for Infs and SNaNs. +// + +.endp tanl +ASM_SIZE_DIRECTIVE(tanl) + +// ******************************************************************* +// ******************************************************************* +// ******************************************************************* +// +// Special Code to handle very large argument case. 
+// Call int pi_by_2_reduce(&x,&r,&c)
+// for |arguments| >= 2**63
+// (Arg or x) is in f8
+// r and c are passed back in stack slots accessed with stfe/ldfe
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
+
+.proc __libm_callout
+__libm_callout:
+L(TANL_ARG_TOO_LARGE):
+.prologue
+{ .mfi
+ add r50=-32,sp // Parameter: r address (= new sp + 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfe [r50] = f0,16 // Clear Parameter r on stack; r50 now -> c slot
+ add r49 = 16,sp // Parameter x address (= new sp + 16)
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfe [r50] = f0,-16 // Clear Parameter c on stack; r50 back -> r slot
+ nop.i 0
+ nop.b 0
+}
+{ .mib
+ stfe [r49] = Arg // Store Parameter x on stack
+ nop.i 0
+(p0) br.call.sptk b0=__libm_pi_by_2_reduce# ;;
+};;
+//
+// Back from argument reduction: reload Arg and load 2^-2
+//
+{ .mmi
+(p0) ldfe Arg =[r49],16
+//
+// table_ptr2 is not set anywhere in this procedure.
+// NOTE(review): confirm it still addresses 2^-2 / -2^-2 after the call.
+(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4
+// Get Arg off stack
+// Get r off stack - hi order part
+// Get c off stack - lo order part
+(p0) mov N_fix_gr = r8 ;; // r8: presumably the int result of the call - confirm
+}
+{ .mmb
+(p0) ldfe r =[r50],16
+(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],4
+ nop.b 999 ;;
+}
+{ .mfi
+(p0) ldfe c =[r50],-32
+ nop.f 999
+ nop.i 999 ;;
+}
+{ .mfi
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+//
+// Is |r| < 2**(-2) ? First half: p6 = (r < 2^-2)
+//
+(p0) fcmp.lt.unc.s1 p6, p0 = r, TWO_TO_NEG2
+mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mfi
+ mov gp = GR_SAVE_GP // Restore gp
+(p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+};;
+{ .mbb
+ nop.m 999
+(p6) br.cond.spnt L(TANL_SMALL_R) // taken when -2^-2 < r < 2^-2
+(p0) br.cond.sptk L(TANL_NORMAL_R) ;;
+}
+
+.endp __libm_callout
+ASM_SIZE_DIRECTIVE(__libm_callout)
+
+.type __libm_pi_by_2_reduce#,@function
+.global __libm_pi_by_2_reduce#
diff --git
a/sysdeps/ia64/fpu/s_trunc.S b/sysdeps/ia64/fpu/s_trunc.S new file mode 100644 index 0000000000..976ddf1517 --- /dev/null +++ b/sysdeps/ia64/fpu/s_trunc.S @@ -0,0 +1,188 @@ +.file "trunc.s" + +// Copyright (c) 2000, 2001, Intel Corporation +// All rights reserved. +// +// Contributed 7/7/2000 by John Harrison, Cristina Iordache, Ted Kubaska, +// Bob Norin, Shane Story, and Ping Tak Peter Tang of the +// Computational Software Lab, Intel Corporation. +// +// WARRANTY DISCLAIMER +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. 
+//
+.align 32
+.global trunc#
+
+.section .text
+.proc trunc#
+.align 32
+
+// History
+//==============================================================
+// 7/7/00: Created
+//==============================================================
+
+// API
+//==============================================================
+// double trunc(double x)
+//==============================================================
+
+#include "libm_support.h"
+
+// general input registers:
+TRUNC_GR_FFFF = r14
+TRUNC_GR_signexp = r15
+TRUNC_GR_exponent = r16
+TRUNC_GR_expmask = r17
+TRUNC_GR_bigexp = r18
+
+// floating-point registers:
+// f8, f9, f11, f12
+
+// predicate registers used:
+// p6, p7, p8, p9, p10, p11
+
+// Overview of operation
+//==============================================================
+// double trunc(double x)
+// Return an integer value (represented as a double) less than or
+// equal to x in magnitude.
+// This is x rounded toward zero to an integral value.
+//==============================================================
+
+// double_extended
+// if the exponent is >= 1003e => 3F(true) = 63(decimal)
+// we have a significand of 64 bits 1.63-bits.
+// If we multiply by 2^63, we no longer have a fractional part
+// So input is an integer value already.
+
+// double
+// if the exponent is >= 10033 => 34(true) = 52(decimal)
+// 34 + 3ff = 433
+// we have a significand of 53 bits 1.52-bits. (implicit 1)
+// If we multiply by 2^52, we no longer have a fractional part
+// So input is an integer value already.
+
+// single
+// if the exponent is >= 10016 => 17(true) = 23(decimal)
+// we have a significand of 24 bits 1.23-bits. (implicit 1)
+// If we multiply by 2^23, we no longer have a fractional part
+// So input is an integer value already.
+
+trunc:
+
+{ .mfi
+ getf.exp TRUNC_GR_signexp = f8
+ fcvt.fx.trunc.s1 f9 = f8
+ addl TRUNC_GR_bigexp = 0x10033, r0 // biased exponent of 2^52
+}
+{ .mfi
+ mov TRUNC_GR_FFFF = 0x0FFFF // biased exponent of 1.0
+ fnorm.d f11 = f8
+ mov TRUNC_GR_expmask = 0x1FFFF
+};;
+// get the exponent of x
+// convert x to integer in significand of f9
+// Normalize x - this will raise invalid on SNaNs, the
+// denormal operand flag - and possibly a spurious U flag
+// get exponent only mask (will exclude sign bit)
+
+{ .mfi
+ nop.m 0
+ fclass.m p7,p8 = f8, 0x0b
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fcmp.eq.unc.s1 p9,p0 = f8,f0
+ nop.i 0
+};;
+// fclass sets p7 if x is unnorm, p8 otherwise; fcmp sets p9 if x == 0
+{ .mmi
+ and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask ;;
+(p8) cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
+(p8) cmp.ne.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_signexp
+};;
+// Get the exponent of x
+// Test if exponent such that result already an integer
+// Test if x < 0 (masked exponent differs from signexp when sign bit is set)
+{ .mmi
+(p9) cmp.eq.andcm p10,p11 = r0, r0
+(p6) cmp.lt.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_FFFF
+ nop.i 0
+};;
+// If -1 < x < 0, set p6, turn off p10 and p11, and set result to -0.0
+{ .mfb
+(p6) cmp.eq.andcm p10,p11 = r0, r0
+(p6) fmerge.s f8 = f8, f0
+ nop.b 0
+};;
+// If not a unnorm, set p10 if x already is a big int, nan, or inf?
+// p10 and p11 are never both set, as declared to the assembler here:
+.pred.rel "mutex",p10,p11
+{ .mfb
+ nop.m 0
+(p11) fcvt.xf f8 = f9
+ nop.b 0
+}
+{ .mfb
+ nop.m 0
+(p10) fma.d.s1 f8 = f11,f1,f0
+(p8) br.ret.sptk b0
+};;
+// If not a unnorm and not a big int, nan, or +/-inf convert significand
+// back to f8.
+// If not a unnorm and a big int, nan, or +/-inf, return fnorm'd x
+// If not a unnorm, Return
+// If unnorm, get the exponent again - perhaps it wasn't a denorm.
+{ .mfb
+(p7) getf.exp TRUNC_GR_signexp = f11
+(p7) fcvt.fx.trunc.s1 f12 = f11
+ nop.b 0
+};;
+{ .mfb
+ and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask
+ fcmp.lt.unc.s1 p9,p0 = f8,f0
+ nop.b 0
+};;
+{ .mfb
+ cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
+ nop.f 0
+ nop.b 0
+};;
+// If a unnorm, check to see if value is already a big int.
+{ .mfb
+ nop.m 0
+(p11) fcvt.xf f8 = f12
+ nop.b 0
+}
+{ .mfi
+ nop.m 0
+(p10) fma.d.s1 f8 = f11,f1,f0
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+(p9) fmerge.ns f8 = f1,f8
+ br.ret.sptk b0
+};;
+// If so return it. Otherwise, return (fcvt.xf(fcvt.fx.trunc(x)))
+// Make sure the result is negative if it should be - that is
+// negative(denormal) -> -0.
+.endp trunc
+ASM_SIZE_DIRECTIVE(trunc)
diff --git a/sysdeps/ia64/fpu/s_truncf.S b/sysdeps/ia64/fpu/s_truncf.S
new file mode 100644
index 0000000000..10364052ad
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_truncf.S
@@ -0,0 +1,188 @@
+.file "truncf.s"
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 7/7/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
+// Bob Norin, Shane Story, and Ping Tak Peter Tang of the
+// Computational Software Lab, Intel Corporation.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://developer.intel.com/opensource.
+//
+.align 32
+.global truncf#
+
+.section .text
+.proc truncf#
+.align 32
+
+// History
+//==============================================================
+// 7/7/00: Created
+//==============================================================
+
+// API
+//==============================================================
+// float truncf(float x)
+//==============================================================
+
+#include "libm_support.h"
+
+// general input registers:
+TRUNC_GR_FFFF = r14
+TRUNC_GR_signexp = r15
+TRUNC_GR_exponent = r16
+TRUNC_GR_expmask = r17
+TRUNC_GR_bigexp = r18
+
+// floating-point registers:
+// f8, f9, f11, f12
+
+// predicate registers used:
+// p6, p7, p8, p9, p10, p11
+
+// Overview of operation
+//==============================================================
+// float truncf(float x)
+// Return an integer value (represented as a float) less than or
+// equal to x in magnitude.
+// This is x rounded toward zero to an integral value.
+//==============================================================
+
+// double_extended
+// if the exponent is >= 1003e => 3F(true) = 63(decimal)
+// we have a significand of 64 bits 1.63-bits.
+// If we multiply by 2^63, we no longer have a fractional part
+// So input is an integer value already.
+
+// double
+// if the exponent is >= 10033 => 34(true) = 52(decimal)
+// 34 + 3ff = 433
+// we have a significand of 53 bits 1.52-bits. (implicit 1)
+// If we multiply by 2^52, we no longer have a fractional part
+// So input is an integer value already.
+
+// single
+// if the exponent is >= 10016 => 17(true) = 23(decimal)
+// we have a significand of 24 bits 1.23-bits. (implicit 1)
+// If we multiply by 2^23, we no longer have a fractional part
+// So input is an integer value already.
+
+truncf:
+
+{ .mfi
+ getf.exp TRUNC_GR_signexp = f8
+ fcvt.fx.trunc.s1 f9 = f8
+ addl TRUNC_GR_bigexp = 0x10016, r0 // biased exponent of 2^23
+}
+{ .mfi
+ mov TRUNC_GR_FFFF = 0x0FFFF // biased exponent of 1.0
+ fnorm.s f11 = f8
+ mov TRUNC_GR_expmask = 0x1FFFF
+};;
+// get the exponent of x
+// convert x to integer in significand of f9
+// Normalize x - this will raise invalid on SNaNs, the
+// denormal operand flag - and possibly a spurious U flag
+// get exponent only mask (will exclude sign bit)
+
+{ .mfi
+ nop.m 0
+ fclass.m p7,p8 = f8, 0x0b
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fcmp.eq.unc.s1 p9,p0 = f8,f0
+ nop.i 0
+};;
+// fclass sets p7 if x is unnorm, p8 otherwise; fcmp sets p9 if x == 0
+{ .mmi
+ and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask ;;
+(p8) cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
+(p8) cmp.ne.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_signexp
+};;
+// Get the exponent of x
+// Test if exponent such that result already an integer
+// Test if x < 0 (masked exponent differs from signexp when sign bit is set)
+{ .mmi
+(p9) cmp.eq.andcm p10,p11 = r0, r0
+(p6) cmp.lt.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_FFFF
+ nop.i 0
+};;
+// If -1 < x < 0, set p6, turn off p10 and p11, and set result to -0.0
+{ .mfb
+(p6) cmp.eq.andcm p10,p11 = r0, r0
+(p6) fmerge.s f8 = f8, f0
+ nop.b 0
+};;
+// If not a unnorm, set p10 if x already is a big int, nan, or inf?
+// p10 and p11 are never both set, as declared to the assembler here:
+.pred.rel "mutex",p10,p11
+{ .mfb
+ nop.m 0
+(p11) fcvt.xf f8 = f9
+ nop.b 0
+}
+{ .mfb
+ nop.m 0
+(p10) fma.s.s1 f8 = f11,f1,f0
+(p8) br.ret.sptk b0
+};;
+// If not a unnorm and not a big int, nan, or +/-inf convert significand
+// back to f8.
+// If not a unnorm and a big int, nan, or +/-inf, return fnorm'd x
+// If not a unnorm, Return
+// If unnorm, get the exponent again - perhaps it wasn't a denorm.
+{ .mfb
+(p7) getf.exp TRUNC_GR_signexp = f11
+(p7) fcvt.fx.trunc.s1 f12 = f11
+ nop.b 0
+};;
+{ .mfb
+ and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask
+ fcmp.lt.unc.s1 p9,p0 = f8,f0
+ nop.b 0
+};;
+{ .mfb
+ cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
+ nop.f 0
+ nop.b 0
+};;
+// If a unnorm, check to see if value is already a big int.
+{ .mfb
+ nop.m 0
+(p11) fcvt.xf f8 = f12
+ nop.b 0
+}
+{ .mfi
+ nop.m 0
+(p10) fma.s.s1 f8 = f11,f1,f0
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+(p9) fmerge.ns f8 = f1,f8
+ br.ret.sptk b0
+};;
+// If so return it. Otherwise, return (fcvt.xf(fcvt.fx.trunc(x)))
+// Make sure the result is negative if it should be - that is
+// negative(denormal) -> -0.
+.endp truncf
+ASM_SIZE_DIRECTIVE(truncf)
diff --git a/sysdeps/ia64/fpu/s_truncl.S b/sysdeps/ia64/fpu/s_truncl.S
new file mode 100644
index 0000000000..aca64b958a
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_truncl.S
@@ -0,0 +1,188 @@
+.file "truncl.s"
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 7/7/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
+// Bob Norin, Shane Story, and Ping Tak Peter Tang of the
+// Computational Software Lab, Intel Corporation.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://developer.intel.com/opensource.
+//
+.align 32
+.global truncl#
+
+.section .text
+.proc truncl#
+.align 32
+
+// History
+//==============================================================
+// 7/7/00: Created
+//==============================================================
+
+// API
+//==============================================================
+// long double truncl(long double x)
+//==============================================================
+
+#include "libm_support.h"
+
+// general input registers:
+TRUNC_GR_FFFF = r14
+TRUNC_GR_signexp = r15
+TRUNC_GR_exponent = r16
+TRUNC_GR_expmask = r17
+TRUNC_GR_bigexp = r18
+
+// floating-point registers:
+// f8, f9, f11, f12
+
+// predicate registers used:
+// p6, p7, p8, p9, p10, p11
+
+// Overview of operation
+//==============================================================
+// long double truncl(long double x)
+// Return an integer value (represented as a long double) less than or
+// equal to x in magnitude.
+// This is x rounded toward zero to an integral value.
+//==============================================================
+
+// double_extended
+// if the exponent is >= 1003e => 3F(true) = 63(decimal)
+// we have a significand of 64 bits 1.63-bits.
+// If we multiply by 2^63, we no longer have a fractional part
+// So input is an integer value already.
+
+// double
+// if the exponent is >= 10033 => 34(true) = 52(decimal)
+// 34 + 3ff = 433
+// we have a significand of 53 bits 1.52-bits. (implicit 1)
+// If we multiply by 2^52, we no longer have a fractional part
+// So input is an integer value already.
+
+// single
+// if the exponent is >= 10016 => 17(true) = 23(decimal)
+// we have a significand of 24 bits 1.23-bits. (implicit 1)
+// If we multiply by 2^23, we no longer have a fractional part
+// So input is an integer value already.
+
+truncl:
+
+{ .mfi
+ getf.exp TRUNC_GR_signexp = f8
+ fcvt.fx.trunc.s1 f9 = f8
+ addl TRUNC_GR_bigexp = 0x1003e, r0 // biased exponent of 2^63
+}
+{ .mfi
+ mov TRUNC_GR_FFFF = 0x0FFFF // biased exponent of 1.0
+ fnorm f11 = f8
+ mov TRUNC_GR_expmask = 0x1FFFF
+};;
+// get the exponent of x
+// convert x to integer in significand of f9
+// Normalize x - this will raise invalid on SNaNs, the
+// denormal operand flag - and possibly a spurious U flag
+// get exponent only mask (will exclude sign bit)
+
+{ .mfi
+ nop.m 0
+ fclass.m p7,p8 = f8, 0x0b
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fcmp.eq.unc.s1 p9,p0 = f8,f0
+ nop.i 0
+};;
+// fclass sets p7 if x is unnorm, p8 otherwise; fcmp sets p9 if x == 0
+{ .mmi
+ and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask ;;
+(p8) cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
+(p8) cmp.ne.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_signexp
+};;
+// Get the exponent of x
+// Test if exponent such that result already an integer
+// Test if x < 0 (masked exponent differs from signexp when sign bit is set)
+{ .mmi
+(p9) cmp.eq.andcm p10,p11 = r0, r0
+(p6) cmp.lt.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_FFFF
+ nop.i 0
+};;
+// If -1 < x < 0, set p6, turn off p10 and p11, and set result to -0.0
+{ .mfb
+(p6) cmp.eq.andcm p10,p11 = r0, r0
+(p6) fmerge.s f8 = f8, f0
+ nop.b 0
+};;
+// If not a unnorm, set p10 if x already is a big int, nan, or inf?
+// p10 and p11 are never both set, as declared to the assembler here:
+.pred.rel "mutex",p10,p11
+{ .mfb
+ nop.m 0
+(p11) fcvt.xf f8 = f9
+ nop.b 0
+}
+{ .mfb
+ nop.m 0
+(p10) fma.s1 f8 = f11,f1,f0
+(p8) br.ret.sptk b0
+};;
+// If not a unnorm and not a big int, nan, or +/-inf convert significand
+// back to f8.
+// If not a unnorm and a big int, nan, or +/-inf, return fnorm'd x
+// If not a unnorm, Return
+// If unnorm, get the exponent again - perhaps it wasn't a denorm.
+{ .mfb
+(p7) getf.exp TRUNC_GR_signexp = f11
+(p7) fcvt.fx.trunc.s1 f12 = f11
+ nop.b 0
+};;
+{ .mfb
+ and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask
+ fcmp.lt.unc.s1 p9,p0 = f8,f0
+ nop.b 0
+};;
+{ .mfb
+ cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
+ nop.f 0
+ nop.b 0
+};;
+// If a unnorm, check to see if value is already a big int.
+{ .mfb
+ nop.m 0
+(p11) fcvt.xf f8 = f12
+ nop.b 0
+}
+{ .mfi
+ nop.m 0
+(p10) fma.s1 f8 = f11,f1,f0
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+(p9) fmerge.ns f8 = f1,f8
+ br.ret.sptk b0
+};;
+// If so return it. Otherwise, return (fcvt.xf(fcvt.fx.trunc(x)))
+// Make sure the result is negative if it should be - that is
+// negative(denormal) -> -0.
+.endp truncl
+ASM_SIZE_DIRECTIVE(truncl)
diff --git a/sysdeps/ia64/fpu/w_acos.c b/sysdeps/ia64/fpu/w_acos.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_acos.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_acosf.c b/sysdeps/ia64/fpu/w_acosf.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_acosf.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_acosl.c b/sysdeps/ia64/fpu/w_acosl.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_acosl.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_asin.c b/sysdeps/ia64/fpu/w_asin.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_asin.c
@@ -0,0 +1 @@
+/* Not needed.
*/ diff --git a/sysdeps/ia64/fpu/w_asinf.c b/sysdeps/ia64/fpu/w_asinf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_asinf.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_asinl.c b/sysdeps/ia64/fpu/w_asinl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_asinl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_atan2.c b/sysdeps/ia64/fpu/w_atan2.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_atan2.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_atan2f.c b/sysdeps/ia64/fpu/w_atan2f.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_atan2f.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_atan2l.c b/sysdeps/ia64/fpu/w_atan2l.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_atan2l.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_cosh.c b/sysdeps/ia64/fpu/w_cosh.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_cosh.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_coshf.c b/sysdeps/ia64/fpu/w_coshf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_coshf.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_coshl.c b/sysdeps/ia64/fpu/w_coshl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_coshl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_exp.c b/sysdeps/ia64/fpu/w_exp.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_exp.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_expf.c b/sysdeps/ia64/fpu/w_expf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_expf.c @@ -0,0 +1 @@ +/* Not needed. 
*/ diff --git a/sysdeps/ia64/fpu/w_fmod.c b/sysdeps/ia64/fpu/w_fmod.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_fmod.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_fmodf.c b/sysdeps/ia64/fpu/w_fmodf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_fmodf.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_fmodl.c b/sysdeps/ia64/fpu/w_fmodl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_fmodl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_hypot.c b/sysdeps/ia64/fpu/w_hypot.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_hypot.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_hypotf.c b/sysdeps/ia64/fpu/w_hypotf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_hypotf.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_hypotl.c b/sysdeps/ia64/fpu/w_hypotl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_hypotl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_log.c b/sysdeps/ia64/fpu/w_log.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_log.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_log10.c b/sysdeps/ia64/fpu/w_log10.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_log10.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_log10f.c b/sysdeps/ia64/fpu/w_log10f.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_log10f.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_log10l.c b/sysdeps/ia64/fpu/w_log10l.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_log10l.c @@ -0,0 +1 @@ +/* Not needed. 
*/ diff --git a/sysdeps/ia64/fpu/w_logf.c b/sysdeps/ia64/fpu/w_logf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_logf.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_logl.c b/sysdeps/ia64/fpu/w_logl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_logl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_pow.c b/sysdeps/ia64/fpu/w_pow.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_pow.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_powf.c b/sysdeps/ia64/fpu/w_powf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_powf.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_powl.c b/sysdeps/ia64/fpu/w_powl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_powl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_remainder.c b/sysdeps/ia64/fpu/w_remainder.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_remainder.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_remainderf.c b/sysdeps/ia64/fpu/w_remainderf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_remainderf.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_remainderl.c b/sysdeps/ia64/fpu/w_remainderl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_remainderl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_scalb.c b/sysdeps/ia64/fpu/w_scalb.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_scalb.c @@ -0,0 +1 @@ +/* Not needed. 
*/ diff --git a/sysdeps/ia64/fpu/w_scalbf.c b/sysdeps/ia64/fpu/w_scalbf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_scalbf.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_scalbl.c b/sysdeps/ia64/fpu/w_scalbl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_scalbl.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_sqrt.c b/sysdeps/ia64/fpu/w_sqrt.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_sqrt.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_sqrtf.c b/sysdeps/ia64/fpu/w_sqrtf.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_sqrtf.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/w_sqrtl.c b/sysdeps/ia64/fpu/w_sqrtl.c new file mode 100644 index 0000000000..41254ae60a --- /dev/null +++ b/sysdeps/ia64/fpu/w_sqrtl.c @@ -0,0 +1 @@ +/* Not needed. */ |