diff options
author | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2018-06-13 17:40:19 +0100 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2018-09-12 17:33:30 +0100 |
commit | f41b0a43e426831e391cafd8d0bd47a3efa4a840 (patch) | |
tree | b60d5ee7f8c6f496089d9cd45b048e072ac1a564 /sysdeps/ieee754/dbl-64/e_log.c | |
parent | 5a274db4ea363d6b0b92933f085a92daaf1be2f2 (diff) | |
download | glibc-f41b0a43e426831e391cafd8d0bd47a3efa4a840.tar.gz glibc-f41b0a43e426831e391cafd8d0bd47a3efa4a840.tar.xz glibc-f41b0a43e426831e391cafd8d0bd47a3efa4a840.zip |
Add new log implementation
Optimized log using carefully generated lookup table with 1/c and log(c) values for small intervalls around 1. The log(c) is very near a double precision value, it has about 62 bits precision. The algorithm is log(2^k x) = k log(2) + log(c) + log(x/c), where the last term is approximated by a polynomial of x/c - 1. Near 1 a single polynomial of x - 1 is used. There is separate code path when fma instruction is not available for computing x/c - 1 precisely, in which case the table size is doubled. The code uses __builtin_fma under __FP_FAST_FMA to ensure it is inlined as an instruction. With the default configuration settings the worst case error is 0.519 ULP (and 0.520 without fma), the rodata size is 2192 bytes (4240 without fma). The non-nearest rounding error is less than 1 ULP. Improvements on Cortex-A72 compared to current glibc master: log thruput: 3.28x in [0.01 11.1] log latency: 2.23x in [0.01 11.1] log thruput: 1.56x in [0.999 1.001] log latency: 1.57x in [0.999 1.001] Tested on aarch64-linux-gnu (defined __FP_FAST_FMA) arm-linux-gnueabihf (!defined __FP_FAST_FMA) x86_64-linux-gnu (!defined __FP_FAST_FMA) powerpc64le-linux-gnu (defined __FP_FAST_FMA) targets. * NEWS: Mention log improvement. * math/Makefile (type-double-routines): Add e_log_data. * sysdeps/i386/fpu/e_log_data.c: New file. * sysdeps/ia64/fpu/e_log_data.c: New file. * sysdeps/ieee754/dbl-64/e_log.c: Rewrite. * sysdeps/ieee754/dbl-64/e_log_data.c: New file. * sysdeps/ieee754/dbl-64/math_config.h (__log_data): Add. * sysdeps/ieee754/dbl-64/ulog.h: Remove. * sysdeps/ieee754/dbl-64/ulog.tbl: Remove. * sysdeps/m68k/m680x0/fpu/e_log_data.c: New file.
Diffstat (limited to 'sysdeps/ieee754/dbl-64/e_log.c')
-rw-r--r-- | sysdeps/ieee754/dbl-64/e_log.c | 257 |
1 files changed, 111 insertions, 146 deletions
diff --git a/sysdeps/ieee754/dbl-64/e_log.c b/sysdeps/ieee754/dbl-64/e_log.c index 2483dd8551..a56b714fb7 100644 --- a/sysdeps/ieee754/dbl-64/e_log.c +++ b/sysdeps/ieee754/dbl-64/e_log.c @@ -1,167 +1,132 @@ -/* - * IBM Accurate Mathematical Library - * written by International Business Machines Corp. - * Copyright (C) 2001-2018 Free Software Foundation, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. - */ -/*********************************************************************/ -/* */ -/* MODULE_NAME:ulog.c */ -/* */ -/* FUNCTION:ulog */ -/* */ -/* FILES NEEDED: dla.h endian.h mpa.h mydefs.h ulog.h */ -/* ulog.tbl */ -/* */ -/* An ultimate log routine. Given an IEEE double machine number x */ -/* it computes the rounded (to nearest) value of log(x). */ -/* Assumption: Machine arithmetic operations are performed in */ -/* round to nearest mode of IEEE 754 standard. */ -/* */ -/*********************************************************************/ +/* Double-precision log(x) function. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ -#include "endian.h" -#include <dla.h> -#include "mpa.h" -#include "MathLib.h" #include <math.h> -#include <math_private.h> +#include <stdint.h> +#include "math_config.h" + +#define T __log_data.tab +#define T2 __log_data.tab2 +#define B __log_data.poly1 +#define A __log_data.poly +#define Ln2hi __log_data.ln2hi +#define Ln2lo __log_data.ln2lo +#define N (1 << LOG_TABLE_BITS) +#define OFF 0x3fe6000000000000 + +/* Top 16 bits of a double. */ +static inline uint32_t +top16 (double x) +{ + return asuint64 (x) >> 48; +} #ifndef SECTION # define SECTION #endif -/*********************************************************************/ -/* An ultimate log routine. Given an IEEE double machine number x */ -/* it computes the rounded (to nearest) value of log(x). */ -/*********************************************************************/ double SECTION __ieee754_log (double x) { - int i, j, n, ux, dx; - double dbl_n, u, p0, q, r0, w, nln2a, luai, lubi, lvaj, lvbj, - sij, ssij, ttij, A, B, B0, polI, polII, t8, a, aa, b, bb, c; -#ifndef DLA_FMS - double t1, t2, t3, t4, t5; -#endif - number num; - -#include "ulog.tbl" -#include "ulog.h" - - /* Treating special values of x ( x<=0, x=INF, x=NaN etc.). */ - - num.d = x; - ux = num.i[HIGH_HALF]; - dx = num.i[LOW_HALF]; - n = 0; - if (__glibc_unlikely (ux < 0x00100000)) + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64 (x); + top = top16 (x); + +#define LO asuint64 (1.0 - 0x1p-4) +#define HI asuint64 (1.0 + 0x1.09p-4) + if (__glibc_unlikely (ix - LO < HI - LO)) { - if (__glibc_unlikely (((ux & 0x7fffffff) | dx) == 0)) - return MHALF / 0.0; /* return -INF */ - if (__glibc_unlikely (ux < 0)) - return (x - x) / 0.0; /* return NaN */ - n -= 54; - x *= two54.d; /* scale x */ - num.d = x; + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && __glibc_unlikely (ix == asuint64 (1.0))) + return 0; + r = x - 1.0; + r2 = r * r; + r3 = r * r2; + y = r3 * (B[1] + r * B[2] + r2 * B[3] + + r3 * (B[4] + r * B[5] + r2 * B[6] + + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); + /* Worst-case error is around 0.507 ULP. */ + w = r * 0x1p27; + double_t rhi = r + w - w; + double_t rlo = r - rhi; + w = rhi * rhi * B[0]; /* B[0] == -0.5. */ + hi = r + w; + lo = r - hi + w; + lo += B[0] * rlo * (rhi + r); + y += lo; + y += hi; + return y; } - if (__glibc_unlikely (ux >= 0x7ff00000)) - return x + x; /* INF or NaN */ - - /* Regular values of x */ - - w = x - 1; - if (__glibc_likely (fabs (w) > U03)) - goto case_03; - - /* log (1) is +0 in all rounding modes. */ - if (w == 0.0) - return 0.0; - - /*--- The case abs(x-1) < 0.03 */ - - t8 = MHALF * w; - EMULV (t8, w, a, aa, t1, t2, t3, t4, t5); - EADD (w, a, b, bb); - /* Evaluate polynomial II */ - polII = b7.d + w * b8.d; - polII = b6.d + w * polII; - polII = b5.d + w * polII; - polII = b4.d + w * polII; - polII = b3.d + w * polII; - polII = b2.d + w * polII; - polII = b1.d + w * polII; - polII = b0.d + w * polII; - polII *= w * w * w; - c = (aa + bb) + polII; - - /* Here b contains the high part of the result, and c the low part. - Maximum error is b * 2.334e-19, so accuracy is >61 bits. - Therefore max ULP error of b + c is ~0.502. */ - return b + c; - - /*--- The case abs(x-1) > 0.03 */ -case_03: - - /* Find n,u such that x = u*2**n, 1/sqrt(2) < u < sqrt(2) */ - n += (num.i[HIGH_HALF] >> 20) - 1023; - num.i[HIGH_HALF] = (num.i[HIGH_HALF] & 0x000fffff) | 0x3ff00000; - if (num.d > SQRT_2) + if (__glibc_unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) { - num.d *= HALF; - n++; + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero (1); + if (ix == asuint64 (INFINITY)) /* log(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid (x); + /* x is subnormal, normalize it. */ + ix = asuint64 (x * 0x1p52); + ix -= 52ULL << 52; } - u = num.d; - dbl_n = (double) n; - - /* Find i such that ui=1+(i-75)/2**8 is closest to u (i= 0,1,2,...,181) */ - num.d += h1.d; - i = (num.i[HIGH_HALF] & 0x000fffff) >> 12; - - /* Find j such that vj=1+(j-180)/2**16 is closest to v=u/ui (j= 0,...,361) */ - num.d = u * Iu[i].d + h2.d; - j = (num.i[HIGH_HALF] & 0x000fffff) >> 4; - /* Compute w=(u-ui*vj)/(ui*vj) */ - p0 = (1 + (i - 75) * DEL_U) * (1 + (j - 180) * DEL_V); - q = u - p0; - r0 = Iu[i].d * Iv[j].d; - w = q * r0; - - /* Evaluate polynomial I */ - polI = w + (a2.d + a3.d * w) * w * w; - - /* Add up everything */ - nln2a = dbl_n * LN2A; - luai = Lu[i][0].d; - lubi = Lu[i][1].d; - lvaj = Lv[j][0].d; - lvbj = Lv[j][1].d; - EADD (luai, lvaj, sij, ssij); - EADD (nln2a, sij, A, ttij); - B0 = (((lubi + lvbj) + ssij) + ttij) + dbl_n * LN2B; - B = polI + B0; - - /* Here A contains the high part of the result, and B the low part. - Maximum abs error is 6.095e-21 and min log (x) is 0.0295 since x > 1.03. - Therefore max ULP error of A + B is ~0.502. */ - return A + B; + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG_TABLE_BITS)) % N; + k = (int64_t) tmp >> 52; /* arithmetic shift */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#ifdef __FP_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = __builtin_fma (z, invc, -1.0); +#else + /* rounding error: 0x1p-55/N + 0x1p-66. */ + r = (z - T2[i].chi - T2[i].clo) * invc; +#endif + kd = (double_t) k; + + /* hi + lo = r + log(c) + k*Ln2. */ + w = kd * Ln2hi + logc; + hi = w + r; + lo = w - hi + r + kd * Ln2lo; + + /* log(x) = lo + (log1p(r) - r) + hi. */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + /* Worst case error if |y| > 0x1p-4: 0.519 ULP (0.520 ULP without fma). + 0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma). */ + y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; + return y; } - #ifndef __ieee754_log strong_alias (__ieee754_log, __log_finite) #endif |