Add new exp and exp2 implementations

Optimized exp and exp2 implementations using a lookup table for fractional powers of 2. There are several variants, see e_exp_data.c, they can be selected by modifying math_config.h allowing different tradeoffs. The default selection should be acceptable as generic libm code. Worst case error is 0.509 ULP for exp and 0.507 ULP for exp2, on aarch64 the rodata size is 2160 bytes, shared between exp and exp2. On aarch64 .text + .rodata size decreased by 24912 bytes. The non-nearest rounding error is less than 1 ULP even on targets without efficient round implementation (although the error rate is higher in that case). Targets with single instruction, rounding mode independent, to nearest integer rounding and conversion can use them by setting TOINT_INTRINSICS and adding the necessary code to their math_private.h. The __exp1 code uses the same algorithm, so the error bound of pow increased a bit. New double precision error handling code was added following the style of the single precision error handling code. Improvements on Cortex-A72 compared to current glibc master: exp thruput: 1.61x in [-9.9 9.9] exp latency: 1.53x in [-9.9 9.9] exp thruput: 1.13x in [0.5 1] exp latency: 1.30x in [0.5 1] exp2 thruput: 2.03x in [-9.9 9.9] exp2 latency: 1.64x in [-9.9 9.9] For small (< 1) inputs the current exp code uses a separate algorithm so the speed up there is less. Was tested on aarch64-linux-gnu (TOINT_INTRINSICS, fma contraction) and arm-linux-gnueabihf (!TOINT_INTRINSICS, no fma contraction) and x86_64-linux-gnu (!TOINT_INTRINSICS, no fma contraction) and powerpc64le-linux-gnu (!TOINT_INTRINSICS, fma contraction) targets, only non-nearest rounding ulp errors increase and they are within acceptable bounds (ulp updates are in separate patches). * NEWS: Mention exp and exp2 improvements. * math/Makefile (libm-support): Remove t_exp. (type-double-routines): Add math_err and e_exp_data. * sysdeps/aarch64/libm-test-ulps: Update. * sysdeps/arm/libm-test-ulps: Update. * sysdeps/i386/fpu/e_exp_data.c: New file. * sysdeps/i386/fpu/math_err.c: New file. * sysdeps/i386/fpu/t_exp.c: Remove. * sysdeps/ia64/fpu/e_exp_data.c: New file. * sysdeps/ia64/fpu/math_err.c: New file. * sysdeps/ia64/fpu/t_exp.c: Remove. * sysdeps/ieee754/dbl-64/e_exp.c: Rewrite. * sysdeps/ieee754/dbl-64/e_exp2.c: Rewrite. * sysdeps/ieee754/dbl-64/e_exp_data.c: New file. * sysdeps/ieee754/dbl-64/e_pow.c (__ieee754_pow): Update error bound. * sysdeps/ieee754/dbl-64/eexp.tbl: Remove. * sysdeps/ieee754/dbl-64/math_config.h: New file. * sysdeps/ieee754/dbl-64/math_err.c: New file. * sysdeps/ieee754/dbl-64/t_exp.c: Remove. * sysdeps/ieee754/dbl-64/t_exp2.h: Remove. * sysdeps/ieee754/dbl-64/uexp.h: Remove. * sysdeps/ieee754/dbl-64/uexp.tbl: Remove. * sysdeps/m68k/m680x0/fpu/e_exp_data.c: New file. * sysdeps/m68k/m680x0/fpu/math_err.c: New file. * sysdeps/m68k/m680x0/fpu/t_exp.c: Remove. * sysdeps/powerpc/fpu/libm-test-ulps: Update. * sysdeps/x86_64/fpu/libm-test-ulps: Update.
author: Szabolcs Nagy <szabolcs.nagy@arm.com> 2018-02-12 18:16:03 +0000
committer: Szabolcs Nagy <szabolcs.nagy@arm.com> 2018-09-05 16:22:00 +0100
commit: e70c17682518fab2fad164fecf73341443bc2ed3 (patch)
tree: b51780512cbcf9b0ded12c313b7e034bf7162273 /sysdeps/ieee754/dbl-64/e_exp2.c
parent: b7cdc2aeb16c07fd9e6ec59f96f862b7fe2d3fdd (diff)
download: glibc-e70c17682518fab2fad164fecf73341443bc2ed3.tar.gz
glibc-e70c17682518fab2fad164fecf73341443bc2ed3.tar.xz
glibc-e70c17682518fab2fad164fecf73341443bc2ed3.zip
1 files changed, 115 insertions, 106 deletions
diff --git a/sysdeps/ieee754/dbl-64/e_exp2.c b/sysdeps/ieee754/dbl-64/e_exp2.c
index d07e787ec8..96afcf9c20 100644
--- a/sysdeps/ieee754/dbl-64/e_exp2.c
+++ b/sysdeps/ieee754/dbl-64/e_exp2.c
@@ -1,7 +1,6 @@
-/* Double-precision floating point 2^x.
-   Copyright (C) 1997-2018 Free Software Foundation, Inc.
+/* Double-precision 2^x function.
+   Copyright (C) 2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Geoffrey Keating <geoffk@ozemail.com.au>
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -17,120 +16,130 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* The basic design here is from
-   Shmuel Gal and Boris Bachelis, "An Accurate Elementary Mathematical
-   Library for the IEEE Floating Point Standard", ACM Trans. Math. Soft.,
-   17 (1), March 1991, pp. 26-45.
-   It has been slightly modified to compute 2^x instead of e^x.
-   */
-#include <stdlib.h>
-#include <float.h>
-#include <ieee754.h>
 #include <math.h>
-#include <fenv.h>
-#include <inttypes.h>
+#include <stdint.h>
 #include <math-barriers.h>
-#include <math_private.h>
-#include <fenv_private.h>
-#include <math-underflow.h>
+#include <math-narrow-eval.h>
+#include "math_config.h"
+
+#define N (1 << EXP_TABLE_BITS)
+#define Shift __exp_data.exp2_shift
+#define T __exp_data.tab
+#define C1 __exp_data.exp2_poly[0]
+#define C2 __exp_data.exp2_poly[1]
+#define C3 __exp_data.exp2_poly[2]
+#define C4 __exp_data.exp2_poly[3]
+#define C5 __exp_data.exp2_poly[4]
+
+/* Handle cases that may overflow or underflow when computing the result that
+   is scale*(1+TMP) without intermediate rounding.  The bit representation of
+   scale is in SBITS, however it has a computed exponent that may have
+   overflown into the sign bit so that needs to be adjusted before using it as
+   a double.  (int32_t)KI is the k used in the argument reduction and exponent
+   adjustment of scale, positive k here means the result may overflow and
+   negative k means the result may underflow.  */
+static inline double
+specialcase (double_t tmp, uint64_t sbits, uint64_t ki)
+{
+  double_t scale, y;
 
-#include "t_exp2.h"
+  if ((ki & 0x80000000) == 0)
+    {
+      /* k > 0, the exponent of scale might have overflowed by 1.  */
+      sbits -= 1ull << 52;
+      scale = asdouble (sbits);
+      y = 2 * (scale + scale * tmp);
+      return check_oflow (y);
+    }
+  /* k < 0, need special care in the subnormal range.  */
+  sbits += 1022ull << 52;
+  scale = asdouble (sbits);
+  y = scale + scale * tmp;
+  if (y < 1.0)
+    {
+      /* Round y to the right precision before scaling it into the subnormal
+	 range to avoid double rounding that can cause 0.5+E/2 ulp error where
+	 E is the worst-case ulp error outside the subnormal range.  So this
+	 is only useful if the goal is better than 1 ulp worst-case error.  */
+      double_t hi, lo;
+      lo = scale - y + scale * tmp;
+      hi = 1.0 + y;
+      lo = 1.0 - hi + y + lo;
+      y = math_narrow_eval (hi + lo) - 1.0;
+      /* Avoid -0.0 with downward rounding.  */
+      if (WANT_ROUNDING && y == 0.0)
+	y = 0.0;
+      /* The underflow exception needs to be signaled explicitly.  */
+      math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
+    }
+  y = 0x1p-1022 * y;
+  return check_uflow (y);
+}
 
-static const double TWO1023 = 8.988465674311579539e+307;
-static const double TWOM1000 = 9.3326361850321887899e-302;
+/* Top 12 bits of a double (sign and exponent bits).  */
+static inline uint32_t
+top12 (double x)
+{
+  return asuint64 (x) >> 52;
+}
 
 double
 __ieee754_exp2 (double x)
 {
-  static const double himark = (double) DBL_MAX_EXP;
-  static const double lomark = (double) (DBL_MIN_EXP - DBL_MANT_DIG - 1);
-
-  /* Check for usual case.  */
-  if (__glibc_likely (isless (x, himark)))
+  uint32_t abstop;
+  uint64_t ki, idx, top, sbits;
+  /* double_t for better performance on targets with FLT_EVAL_METHOD==2.  */
+  double_t kd, r, r2, scale, tail, tmp;
+
+  abstop = top12 (x) & 0x7ff;
+  if (__glibc_unlikely (abstop - top12 (0x1p-54)
+			>= top12 (512.0) - top12 (0x1p-54)))
     {
-      /* Exceptional cases:  */
-      if (__glibc_unlikely (!isgreaterequal (x, lomark)))
-	{
-	  if (isinf (x))
-	    /* e^-inf == 0, with no error.  */
-	    return 0;
-	  else
-	    /* Underflow */
-	    return TWOM1000 * TWOM1000;
-	}
-
-      static const double THREEp42 = 13194139533312.0;
-      int tval, unsafe;
-      double rx, x22, result;
-      union ieee754_double ex2_u, scale_u;
-
-      if (fabs (x) < DBL_EPSILON / 4.0)
-	return 1.0 + x;
-
-      {
-	SET_RESTORE_ROUND_NOEX (FE_TONEAREST);
-
-	/* 1. Argument reduction.
-	   Choose integers ex, -256 <= t < 256, and some real
-	   -1/1024 <= x1 <= 1024 so that
-	   x = ex + t/512 + x1.
-
-	   First, calculate rx = ex + t/512.  */
-	rx = x + THREEp42;
-	rx -= THREEp42;
-	x -= rx;  /* Compute x=x1. */
-	/* Compute tval = (ex*512 + t)+256.
-	   Now, t = (tval mod 512)-256 and ex=tval/512  [that's mod, NOT %;
-	   and /-round-to-nearest not the usual c integer /].  */
-	tval = (int) (rx * 512.0 + 256.0);
-
-	/* 2. Adjust for accurate table entry.
-	   Find e so that
-	   x = ex + t/512 + e + x2
-	   where -1e6 < e < 1e6, and
-	   (double)(2^(t/512+e))
-	   is accurate to one part in 2^-64.  */
-
-	/* 'tval & 511' is the same as 'tval%512' except that it's always
-	   positive.
-	   Compute x = x2.  */
-	x -= exp2_deltatable[tval & 511];
-
-	/* 3. Compute ex2 = 2^(t/512+e+ex).  */
-	ex2_u.d = exp2_accuratetable[tval & 511];
-	tval >>= 9;
-	/* x2 is an integer multiple of 2^-54; avoid intermediate
-	   underflow from the calculation of x22 * x.  */
-	unsafe = abs (tval) >= -DBL_MIN_EXP - 56;
-	ex2_u.ieee.exponent += tval >> unsafe;
-	scale_u.d = 1.0;
-	scale_u.ieee.exponent += tval - (tval >> unsafe);
-
-	/* 4. Approximate 2^x2 - 1, using a fourth-degree polynomial,
-	   with maximum error in [-2^-10-2^-30,2^-10+2^-30]
-	   less than 10^-19.  */
-
-	x22 = (((.0096181293647031180
-		 * x + .055504110254308625)
-		* x + .240226506959100583)
-	       * x + .69314718055994495) * ex2_u.d;
-	math_opt_barrier (x22);
-      }
-
-      /* 5. Return (2^x2-1) * 2^(t/512+e+ex) + 2^(t/512+e+ex).  */
-      result = x22 * x + ex2_u.d;
-
-      if (!unsafe)
-	return result;
-      else
+      if (abstop - top12 (0x1p-54) >= 0x80000000)
+	/* Avoid spurious underflow for tiny x.  */
+	/* Note: 0 is common input.  */
+	return WANT_ROUNDING ? 1.0 + x : 1.0;
+      if (abstop >= top12 (1024.0))
 	{
-	  result *= scale_u.d;
-	  math_check_force_underflow_nonneg (result);
-	  return result;
+	  if (asuint64 (x) == asuint64 (-INFINITY))
+	    return 0.0;
+	  if (abstop >= top12 (INFINITY))
+	    return 1.0 + x;
+	  if (!(asuint64 (x) >> 63))
+	    return __math_oflow (0);
+	  else if (asuint64 (x) >= asuint64 (-1075.0))
+	    return __math_uflow (0);
 	}
+      if (2 * asuint64 (x) > 2 * asuint64 (928.0))
+	/* Large x is special cased below.  */
+	abstop = 0;
     }
-  else
-    /* Return x, if x is a NaN or Inf; or overflow, otherwise.  */
-    return TWO1023 * x;
+
+  /* exp2(x) = 2^(k/N) * 2^r, with 2^r in [2^(-1/2N),2^(1/2N)].  */
+  /* x = k/N + r, with int k and r in [-1/2N, 1/2N].  */
+  kd = math_narrow_eval (x + Shift);
+  ki = asuint64 (kd); /* k.  */
+  kd -= Shift; /* k/N for int k.  */
+  r = x - kd;
+  /* 2^(k/N) ~= scale * (1 + tail).  */
+  idx = 2 * (ki % N);
+  top = ki << (52 - EXP_TABLE_BITS);
+  tail = asdouble (T[idx]);
+  /* This is only a valid scale when -1023*N < k < 1024*N.  */
+  sbits = T[idx + 1] + top;
+  /* exp2(x) = 2^(k/N) * 2^r ~= scale + scale * (tail + 2^r - 1).  */
+  /* Evaluation is optimized assuming superscalar pipelined execution.  */
+  r2 = r * r;
+  /* Without fma the worst case error is 0.5/N ulp larger.  */
+  /* Worst case error is less than 0.5+0.86/N+(abs poly error * 2^53) ulp.  */
+  tmp = tail + r * C1 + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5);
+  if (__glibc_unlikely (abstop == 0))
+    return specialcase (tmp, sbits, ki);
+  scale = asdouble (sbits);
+  /* Note: tmp == 0 or |tmp| > 2^-65 and scale > 2^-928, so there
+     is no spurious underflow here even without fma.  */
+  return scale + scale * tmp;
 }
+#ifndef __ieee754_exp2
 strong_alias (__ieee754_exp2, __exp2_finite)
+#endif
author	Szabolcs Nagy <szabolcs.nagy@arm.com>	2018-02-12 18:16:03 +0000
committer	Szabolcs Nagy <szabolcs.nagy@arm.com>	2018-09-05 16:22:00 +0100
commit	e70c17682518fab2fad164fecf73341443bc2ed3 (patch)
tree	b51780512cbcf9b0ded12c313b7e034bf7162273 /sysdeps/ieee754/dbl-64/e_exp2.c
parent	b7cdc2aeb16c07fd9e6ec59f96f862b7fe2d3fdd (diff)
download	glibc-e70c17682518fab2fad164fecf73341443bc2ed3.tar.gz glibc-e70c17682518fab2fad164fecf73341443bc2ed3.tar.xz glibc-e70c17682518fab2fad164fecf73341443bc2ed3.zip