From 1bead169c32a3a688de863709b863207b7aafddd Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Wed, 28 Nov 2012 13:40:54 +0000 Subject: Fix powl inaccuracy for x86_64 and x86 (bug 13881). --- sysdeps/i386/fpu/e_powl.S | 55 +++++----- sysdeps/i386/fpu/libm-test-ulps | 9 ++ sysdeps/x86/fpu/Makefile | 3 + sysdeps/x86/fpu/powl_helper.c | 211 ++++++++++++++++++++++++++++++++++++++ sysdeps/x86_64/fpu/e_powl.S | 52 ++++------ sysdeps/x86_64/fpu/libm-test-ulps | 4 + 6 files changed, 274 insertions(+), 60 deletions(-) create mode 100644 sysdeps/x86/fpu/Makefile create mode 100644 sysdeps/x86/fpu/powl_helper.c (limited to 'sysdeps') diff --git a/sysdeps/i386/fpu/e_powl.S b/sysdeps/i386/fpu/e_powl.S index ac4842cf63..7e297756ff 100644 --- a/sysdeps/i386/fpu/e_powl.S +++ b/sysdeps/i386/fpu/e_powl.S @@ -26,9 +26,9 @@ .type one,@object one: .double 1.0 ASM_SIZE_DIRECTIVE(one) - .type limit,@object -limit: .double 0.29 - ASM_SIZE_DIRECTIVE(limit) + .type p3,@object +p3: .byte 0, 0, 0, 0, 0, 0, 0x20, 0x40 + ASM_SIZE_DIRECTIVE(p3) .type p63,@object p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 ASM_SIZE_DIRECTIVE(p63) @@ -141,7 +141,15 @@ ENTRY(__ieee754_powl) fchs // -0x1p-79 : x jmp 3f -9: /* OK, we have an integer value for y. */ +9: /* OK, we have an integer value for y. Unless very small + (we use < 8), use the algorithm for real exponent to avoid + accumulation of errors. */ + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p3) // y : x + fnstsw + sahf + jnc 2f popl %eax cfi_adjust_cfa_offset (-4) popl %edx @@ -182,7 +190,7 @@ ENTRY(__ieee754_powl) cfi_adjust_cfa_offset (8) .align ALIGNARG(4) -2: // y is a large integer (absolute value at least 1L<<63), but +2: // y is a large integer (absolute value at least 8), but // may be odd unless at least 1L<<64. So it may be necessary // to adjust the sign of a negative result afterwards. fxch // x : y @@ -205,34 +213,21 @@ ENTRY(__ieee754_powl) fchs // -(1L<<78) : |x| .align ALIGNARG(4) 3: /* y is a real number. 
*/ - fxch // x : y - fldl MO(one) // 1.0 : x : y - fldl MO(limit) // 0.29 : 1.0 : x : y - fld %st(2) // x : 0.29 : 1.0 : x : y - fsub %st(2) // x-1 : 0.29 : 1.0 : x : y - fabs // |x-1| : 0.29 : 1.0 : x : y - fucompp // 1.0 : x : y - fnstsw - fxch // x : 1.0 : y - sahf - ja 7f - fsub %st(1) // x-1 : 1.0 : y - fyl2xp1 // log2(x) : y - jmp 8f - -7: fyl2x // log2(x) : y -8: fmul %st(1) // y*log2(x) : y - fst %st(1) // y*log2(x) : y*log2(x) - frndint // int(y*log2(x)) : y*log2(x) - fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x)) - fxch // fract(y*log2(x)) : int(y*log2(x)) - f2xm1 // 2^fract(y*log2(x))-1 : int(y*log2(x)) - faddl MO(one) // 2^fract(y*log2(x)) : int(y*log2(x)) - fscale // 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x)) - fstp %st(1) // 2^fract(y*log2(x))*2^int(y*log2(x)) + subl $28, %esp + cfi_adjust_cfa_offset (28) + fstpt 12(%esp) // x + fstpt (%esp) // + mov %edx, 24(%esp) + call HIDDEN_JUMPTARGET (__powl_helper) // + mov 24(%esp), %edx + addl $28, %esp + cfi_adjust_cfa_offset (-28) testb $2, %dh jz 292f // x is negative. If y is an odd integer, negate the result. 
+#ifdef PIC + LOAD_PIC_REG (cx) +#endif fldt 24(%esp) // y : abs(result) fld %st // y : y : abs(result) fabs // |y| : y : abs(result) diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps index 8be00860c9..5b595bc566 100644 --- a/sysdeps/i386/fpu/libm-test-ulps +++ b/sysdeps/i386/fpu/libm-test-ulps @@ -2480,6 +2480,11 @@ ifloat: 1 ildouble: 1 ldouble: 1 +# pow +Test "pow (0x0.ffffffp0, -0x1p24) == 2.7182819094701610539628664526874952929416": +ildouble: 1 +ldouble: 1 + # pow_downward Test "pow_downward (1.0625, 1.125) == 1.070582293028761362162622578677070098674": double: 1 @@ -3782,6 +3787,10 @@ ifloat: 1 ildouble: 1 ldouble: 1 +Function: "pow": +ildouble: 1 +ldouble: 1 + Function: "pow_downward": double: 1 float: 1 diff --git a/sysdeps/x86/fpu/Makefile b/sysdeps/x86/fpu/Makefile new file mode 100644 index 0000000000..8054380477 --- /dev/null +++ b/sysdeps/x86/fpu/Makefile @@ -0,0 +1,3 @@ +ifeq ($(subdir),math) +libm-support += powl_helper +endif diff --git a/sysdeps/x86/fpu/powl_helper.c b/sysdeps/x86/fpu/powl_helper.c new file mode 100644 index 0000000000..3f69b08a1b --- /dev/null +++ b/sysdeps/x86/fpu/powl_helper.c @@ -0,0 +1,211 @@ +/* Implement powl for x86 using extra-precision log. + Copyright (C) 2012 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <math.h> +#include <math_private.h> + +/* High parts and low parts of -log (k/16), for integer k from 12 to + 24. */ + +static const long double powl_log_table[] = + { + 0x4.9a58844d36e49e1p-4L, -0x1.0522624fd558f574p-68L, + 0x3.527da7915b3c6de4p-4L, 0x1.7d4ef4b901b99b9ep-68L, + 0x2.22f1d044fc8f7bc8p-4L, -0x1.8e97c071a42fc388p-68L, + 0x1.08598b59e3a0688ap-4L, 0x3.fd9bf503372c12fcp-72L, + -0x0p+0L, 0x0p+0L, + -0xf.85186008b15330cp-8L, 0x1.9b47488a6687672cp-72L, + -0x1.e27076e2af2e5e9ep-4L, -0xa.87ffe1fe9e155dcp-72L, + -0x2.bfe60e14f27a791p-4L, 0x1.83bebf1bdb88a032p-68L, + -0x3.91fef8f353443584p-4L, -0xb.b03de5ff734495cp-72L, + -0x4.59d72aeae98380e8p-4L, 0xc.e0aa3be4747dc1p-72L, + -0x5.1862f08717b09f4p-4L, -0x2.decdeccf1cd10578p-68L, + -0x5.ce75fdaef401a738p-4L, -0x9.314feb4fbde5aaep-72L, + -0x6.7cc8fb2fe612fcbp-4L, 0x2.5ca2642feb779f98p-68L, + }; + +/* High 32 bits of log2 (e), and remainder rounded to 64 bits. */ +static const long double log2e_hi = 0x1.71547652p+0L; +static const long double log2e_lo = 0xb.82fe1777d0ffda1p-36L; + +/* Given a number with high part HI and low part LO, add the number X + to it and store the result in *RHI and *RLO. It is given that + either |X| < |0.7 * HI|, or HI == LO == 0, and that the values are + small enough that no overflow occurs. The result does not need to + be exact to 128 bits; 78-bit accuracy of the final accumulated + result suffices. */ + +static inline void +acc_split (long double *rhi, long double *rlo, long double hi, long double lo, + long double x) +{ + long double thi = hi + x; + long double tlo = (hi - thi) + x + lo; + *rhi = thi + tlo; + *rlo = (thi - *rhi) + tlo; +} + +extern long double __powl_helper (long double x, long double y); +libm_hidden_proto (__powl_helper) + +/* Given X a value that is finite and nonzero, or a NaN, and only + negative if Y is not an integer, and Y a finite nonzero value with + 0x1p-79 <= |Y| <= 0x1p78, compute X to the power Y. 
*/ + +long double +__powl_helper (long double x, long double y) +{ + if (isnan (x) || x < 0) + return __ieee754_expl (y * __ieee754_logl (x)); + + /* We need to compute Y * log2 (X) to at least 64 bits after the + point for normal results (that is, to at least 78 bits + precision). */ + int x_int_exponent; + long double x_frac; + x_frac = __frexpl (x, &x_int_exponent); + if (x_frac <= 0x0.aaaaaaaaaaaaaaaap0L) /* 2.0L / 3.0L, rounded down */ + { + x_frac *= 2.0; + x_int_exponent--; + } + + long double log_x_frac_hi, log_x_frac_lo; + /* Determine an initial approximation to log (X_FRAC) using + POWL_LOG_TABLE, and multiply by a value K/16 to reduce to an + interval (24/25, 26/25). */ + int k = (int) ((16.0L / x_frac) + 0.5L); + log_x_frac_hi = powl_log_table[2 * k - 24]; + log_x_frac_lo = powl_log_table[2 * k - 23]; + long double x_frac_low; + if (k == 16) + x_frac_low = 0.0L; + else + { + /* Mask off low 5 bits of X_FRAC so the multiplication by K/16 + is exact. These bits are small enough that they can be + corrected for by adding log2 (e) * X_FRAC_LOW to the final + result. */ + int32_t se; + u_int32_t i0, i1; + GET_LDOUBLE_WORDS (se, i0, i1, x_frac); + x_frac_low = x_frac; + i1 &= 0xffffffe0; + SET_LDOUBLE_WORDS (x_frac, se, i0, i1); + x_frac_low -= x_frac; + x_frac_low /= x_frac; + x_frac *= k / 16.0L; + } + + /* Now compute log (X_FRAC) for X_FRAC in (24/25, 26/25). Separate + W = X_FRAC - 1 into high 16 bits and remaining bits, so that + multiplications for low-order power series terms are exact. The + remaining bits are small enough that adding a 64-bit value of + log2 (1 + W_LO / (1 + W_HI)) will be a sufficient correction for + them. 
*/ + long double w = x_frac - 1; + long double w_hi, w_lo; + int32_t se; + u_int32_t i0, i1; + GET_LDOUBLE_WORDS (se, i0, i1, w); + i0 &= 0xffff0000; + i1 = 0; + SET_LDOUBLE_WORDS (w_hi, se, i0, i1); + w_lo = w - w_hi; + long double wp = w_hi; + acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, wp); + wp *= -w_hi; + acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, + wp / 2.0L); + wp *= -w_hi; + acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, + wp * 0x0.5555p0L); /* -W_HI**3 / 3, high part. */ + acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, + wp * 0x0.5555555555555555p-16L); /* -W_HI**3 / 3, low part. */ + wp *= -w_hi; + acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, + wp / 4.0L); + /* Subsequent terms are small enough that they only need be computed + to 64 bits. */ + for (int i = 5; i <= 17; i++) + { + wp *= -w_hi; + acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, + wp / i); + } + + /* Convert LOG_X_FRAC_HI + LOG_X_FRAC_LO to a base-2 logarithm. */ + long double log2_x_frac_hi, log2_x_frac_lo; + long double log_x_frac_hi32, log_x_frac_lo64; + GET_LDOUBLE_WORDS (se, i0, i1, log_x_frac_hi); + i1 = 0; + SET_LDOUBLE_WORDS (log_x_frac_hi32, se, i0, i1); + log_x_frac_lo64 = (log_x_frac_hi - log_x_frac_hi32) + log_x_frac_lo; + long double log2_x_frac_hi1 = log_x_frac_hi32 * log2e_hi; + long double log2_x_frac_lo1 + = log_x_frac_lo64 * log2e_hi + log_x_frac_hi * log2e_lo; + log2_x_frac_hi = log2_x_frac_hi1 + log2_x_frac_lo1; + log2_x_frac_lo = (log2_x_frac_hi1 - log2_x_frac_hi) + log2_x_frac_lo1; + + /* Correct for the masking off of W_LO. */ + long double log2_1p_w_lo; + asm ("fyl2xp1" + : "=t" (log2_1p_w_lo) + : "0" (w_lo / (1.0L + w_hi)), "u" (1.0L) + : "st(1)"); + acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo, + log2_1p_w_lo); + + /* Correct for the masking off of X_FRAC_LOW. 
*/ + acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo, + x_frac_low * M_LOG2El); + + /* Add the integer and fractional parts of the base-2 logarithm. */ + long double log2_x_hi, log2_x_lo; + log2_x_hi = x_int_exponent + log2_x_frac_hi; + log2_x_lo = ((x_int_exponent - log2_x_hi) + log2_x_frac_hi) + log2_x_frac_lo; + + /* Compute the base-2 logarithm of the result. */ + long double log2_res_hi, log2_res_lo; + long double log2_x_hi32, log2_x_lo64; + GET_LDOUBLE_WORDS (se, i0, i1, log2_x_hi); + i1 = 0; + SET_LDOUBLE_WORDS (log2_x_hi32, se, i0, i1); + log2_x_lo64 = (log2_x_hi - log2_x_hi32) + log2_x_lo; + long double y_hi32, y_lo32; + GET_LDOUBLE_WORDS (se, i0, i1, y); + i1 = 0; + SET_LDOUBLE_WORDS (y_hi32, se, i0, i1); + y_lo32 = y - y_hi32; + log2_res_hi = log2_x_hi32 * y_hi32; + log2_res_lo = log2_x_hi32 * y_lo32 + log2_x_lo64 * y; + + /* Split the base-2 logarithm of the result into integer and + fractional parts. */ + long double log2_res_int = __roundl (log2_res_hi); + long double log2_res_frac = log2_res_hi - log2_res_int + log2_res_lo; + + /* Compute the final result. */ + long double res; + asm ("f2xm1" : "=t" (res) : "0" (log2_res_frac)); + res += 1.0L; + asm ("fscale" : "=t" (res) : "0" (res), "u" (log2_res_int)); + return res; +} + +libm_hidden_def (__powl_helper) diff --git a/sysdeps/x86_64/fpu/e_powl.S b/sysdeps/x86_64/fpu/e_powl.S index 1b3718522d..ff96cec68a 100644 --- a/sysdeps/x86_64/fpu/e_powl.S +++ b/sysdeps/x86_64/fpu/e_powl.S @@ -26,9 +26,9 @@ .type one,@object one: .double 1.0 ASM_SIZE_DIRECTIVE(one) - .type limit,@object -limit: .double 0.29 - ASM_SIZE_DIRECTIVE(limit) + .type p3,@object +p3: .byte 0, 0, 0, 0, 0, 0, 0x20, 0x40 + ASM_SIZE_DIRECTIVE(p3) .type p63,@object p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 ASM_SIZE_DIRECTIVE(p63) @@ -131,7 +131,15 @@ ENTRY(__ieee754_powl) fchs // -0x1p-79 : x jmp 3f -9: /* OK, we have an integer value for y. */ +9: /* OK, we have an integer value for y. 
Unless very small + (we use < 8), use the algorithm for real exponent to avoid + accumulation of errors. */ + fldl MO(p3) // 8 : y : x + fld %st(1) // y : 8 : y : x + fabs // |y| : 8 : y : x + fcomip %st(1), %st // 8 : y : x + fstp %st(0) // y : x + jnc 2f mov -8(%rsp),%eax mov -4(%rsp),%edx orl $0, %edx @@ -167,7 +175,7 @@ ENTRY(__ieee754_powl) ret .align ALIGNARG(4) -2: // y is a large integer (absolute value at least 1L<<63), but +2: // y is a large integer (absolute value at least 8), but // may be odd unless at least 1L<<64. So it may be necessary // to adjust the sign of a negative result afterwards. fxch // x : y @@ -190,31 +198,15 @@ ENTRY(__ieee754_powl) fchs // -(1L<<78) : |x| .align ALIGNARG(4) 3: /* y is a real number. */ - fxch // x : y - fldl MO(one) // 1.0 : x : y - fldl MO(limit) // 0.29 : 1.0 : x : y - fld %st(2) // x : 0.29 : 1.0 : x : y - fsub %st(2) // x-1 : 0.29 : 1.0 : x : y - fabs // |x-1| : 0.29 : 1.0 : x : y - fucompp // 1.0 : x : y - fnstsw - fxch // x : 1.0 : y - test $0x4500,%eax - jz 7f - fsub %st(1) // x-1 : 1.0 : y - fyl2xp1 // log2(x) : y - jmp 8f - -7: fyl2x // log2(x) : y -8: fmul %st(1) // y*log2(x) : y - fst %st(1) // y*log2(x) : y*log2(x) - frndint // int(y*log2(x)) : y*log2(x) - fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x)) - fxch // fract(y*log2(x)) : int(y*log2(x)) - f2xm1 // 2^fract(y*log2(x))-1 : int(y*log2(x)) - faddl MO(one) // 2^fract(y*log2(x)) : int(y*log2(x)) - fscale // 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x)) - fstp %st(1) // 2^fract(y*log2(x))*2^int(y*log2(x)) + subq $40, %rsp + cfi_adjust_cfa_offset (40) + fstpt 16(%rsp) // x + fstpt (%rsp) // + mov %edx, 32(%rsp) + call HIDDEN_JUMPTARGET (__powl_helper) // + mov 32(%rsp), %edx + addq $40, %rsp + cfi_adjust_cfa_offset (-40) testb $2, %dh jz 292f // x is negative. If y is an odd integer, negate the result. 
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps index f33dfed3df..9e7a8adac3 100644 --- a/sysdeps/x86_64/fpu/libm-test-ulps +++ b/sysdeps/x86_64/fpu/libm-test-ulps @@ -2398,6 +2398,8 @@ ifloat: 1 Test "pow (0x0.ffffffp0, -0x1p24) == 2.7182819094701610539628664526874952929416": float: 1 ifloat: 1 +ildouble: 1 +ldouble: 1 Test "pow (0x0.ffffffp0, 0x1p24) == 0.3678794302077803437135155590023422899744": float: 1 ifloat: 1 @@ -3575,6 +3577,8 @@ ifloat: 1 Function: "pow": float: 1 ifloat: 1 +ildouble: 1 +ldouble: 1 Function: "pow_downward": float: 1 -- cgit 1.4.1