6 files changed, 274 insertions, 60 deletions
diff --git a/sysdeps/i386/fpu/e_powl.S b/sysdeps/i386/fpu/e_powl.S
index ac4842cf63..7e297756ff 100644
--- a/sysdeps/i386/fpu/e_powl.S
+++ b/sysdeps/i386/fpu/e_powl.S
@@ -26,9 +26,9 @@
 	.type one,@object
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
-	.type limit,@object
-limit:	.double 0.29
-	ASM_SIZE_DIRECTIVE(limit)
+	.type p3,@object
+p3:	.byte 0, 0, 0, 0, 0, 0, 0x20, 0x40
+	ASM_SIZE_DIRECTIVE(p3)
 	.type p63,@object
 p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
 	ASM_SIZE_DIRECTIVE(p63)
@@ -141,7 +141,15 @@ ENTRY(__ieee754_powl)
 	fchs			// -0x1p-79 : x
 	jmp	3f
 
-9:	/* OK, we have an integer value for y.  */
+9:	/* OK, we have an integer value for y.  Unless very small
+	   (we use < 8), use the algorithm for real exponent to avoid
+	   accumulation of errors.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p3)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
 	popl	%eax
 	cfi_adjust_cfa_offset (-4)
 	popl	%edx
@@ -182,7 +190,7 @@ ENTRY(__ieee754_powl)
 
 	cfi_adjust_cfa_offset (8)
 	.align ALIGNARG(4)
-2:	// y is a large integer (absolute value at least 1L<<63), but
+2:	// y is a large integer (absolute value at least 8), but
 	// may be odd unless at least 1L<<64.  So it may be necessary
 	// to adjust the sign of a negative result afterwards.
 	fxch			// x : y
@@ -205,34 +213,21 @@ ENTRY(__ieee754_powl)
 	fchs			// -(1L<<78) : |x|
 	.align ALIGNARG(4)
 3:	/* y is a real number.  */
-	fxch			// x : y
-	fldl	MO(one)		// 1.0 : x : y
-	fldl	MO(limit)	// 0.29 : 1.0 : x : y
-	fld	%st(2)		// x : 0.29 : 1.0 : x : y
-	fsub	%st(2)		// x-1 : 0.29 : 1.0 : x : y
-	fabs			// |x-1| : 0.29 : 1.0 : x : y
-	fucompp			// 1.0 : x : y
-	fnstsw
-	fxch			// x : 1.0 : y
-	sahf
-	ja	7f
-	fsub	%st(1)		// x-1 : 1.0 : y
-	fyl2xp1			// log2(x) : y
-	jmp	8f
-
-7:	fyl2x			// log2(x) : y
-8:	fmul	%st(1)		// y*log2(x) : y
-	fst	%st(1)		// y*log2(x) : y*log2(x)
-	frndint			// int(y*log2(x)) : y*log2(x)
-	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
-	fxch			// fract(y*log2(x)) : int(y*log2(x))
-	f2xm1			// 2^fract(y*log2(x))-1 : int(y*log2(x))
-	faddl	MO(one)		// 2^fract(y*log2(x)) : int(y*log2(x))
-	fscale			// 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
-	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
+	subl	$28, %esp
+	cfi_adjust_cfa_offset (28)
+	fstpt	12(%esp)	// x
+	fstpt	(%esp)		// <empty>
+	mov	%edx, 24(%esp)
+	call	HIDDEN_JUMPTARGET (__powl_helper)	// <result>
+	mov	24(%esp), %edx
+	addl	$28, %esp
+	cfi_adjust_cfa_offset (-28)
 	testb	$2, %dh
 	jz	292f
 	// x is negative.  If y is an odd integer, negate the result.
+#ifdef	PIC
+	LOAD_PIC_REG (cx)
+#endif
 	fldt	24(%esp)	// y : abs(result)
 	fld	%st		// y : y : abs(result)
 	fabs			// |y| : y : abs(result)
diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index 8be00860c9..5b595bc566 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -2480,6 +2480,11 @@ ifloat: 1
 ildouble: 1
 ldouble: 1
 
+# pow
+Test "pow (0x0.ffffffp0, -0x1p24) == 2.7182819094701610539628664526874952929416":
+ildouble: 1
+ldouble: 1
+
 # pow_downward
 Test "pow_downward (1.0625, 1.125) == 1.070582293028761362162622578677070098674":
 double: 1
@@ -3782,6 +3787,10 @@ ifloat: 1
 ildouble: 1
 ldouble: 1
 
+Function: "pow":
+ildouble: 1
+ldouble: 1
+
 Function: "pow_downward":
 double: 1
 float: 1
diff --git a/sysdeps/x86/fpu/Makefile b/sysdeps/x86/fpu/Makefile
new file mode 100644
index 0000000000..8054380477
--- /dev/null
+++ b/sysdeps/x86/fpu/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),math)
+libm-support += powl_helper
+endif
diff --git a/sysdeps/x86/fpu/powl_helper.c b/sysdeps/x86/fpu/powl_helper.c
new file mode 100644
index 0000000000..3f69b08a1b
--- /dev/null
+++ b/sysdeps/x86/fpu/powl_helper.c
@@ -0,0 +1,211 @@
+/* Implement powl for x86 using extra-precision log.
+   Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+#include <math_private.h>
+
+/* High parts and low parts of -log (k/16), for integer k from 12 to
+   24.  */
+
+static const long double powl_log_table[] =
+  {
+    0x4.9a58844d36e49e1p-4L, -0x1.0522624fd558f574p-68L,
+    0x3.527da7915b3c6de4p-4L, 0x1.7d4ef4b901b99b9ep-68L,
+    0x2.22f1d044fc8f7bc8p-4L, -0x1.8e97c071a42fc388p-68L,
+    0x1.08598b59e3a0688ap-4L, 0x3.fd9bf503372c12fcp-72L,
+    -0x0p+0L, 0x0p+0L,
+    -0xf.85186008b15330cp-8L, 0x1.9b47488a6687672cp-72L,
+    -0x1.e27076e2af2e5e9ep-4L, -0xa.87ffe1fe9e155dcp-72L,
+    -0x2.bfe60e14f27a791p-4L, 0x1.83bebf1bdb88a032p-68L,
+    -0x3.91fef8f353443584p-4L, -0xb.b03de5ff734495cp-72L,
+    -0x4.59d72aeae98380e8p-4L, 0xc.e0aa3be4747dc1p-72L,
+    -0x5.1862f08717b09f4p-4L, -0x2.decdeccf1cd10578p-68L,
+    -0x5.ce75fdaef401a738p-4L, -0x9.314feb4fbde5aaep-72L,
+    -0x6.7cc8fb2fe612fcbp-4L, 0x2.5ca2642feb779f98p-68L,
+  };
+
+/* High 32 bits of log2 (e), and remainder rounded to 64 bits.  */
+static const long double log2e_hi = 0x1.71547652p+0L;
+static const long double log2e_lo = 0xb.82fe1777d0ffda1p-36L;
+
+/* Given a number with high part HI and low part LO, add the number X
+   to it and store the result in *RHI and *RLO.  It is given that
+   either |X| < |0.7 * HI|, or HI == LO == 0, and that the values are
+   small enough that no overflow occurs.  The result does not need to
+   be exact to 128 bits; 78-bit accuracy of the final accumulated
+   result suffices.  */
+
+static inline void
+acc_split (long double *rhi, long double *rlo, long double hi, long double lo,
+	   long double x)
+{
+  long double thi = hi + x;
+  long double tlo = (hi - thi) + x + lo;
+  *rhi = thi + tlo;
+  *rlo = (thi - *rhi) + tlo;
+}
+
+extern long double __powl_helper (long double x, long double y);
+libm_hidden_proto (__powl_helper)
+
+/* Given X a value that is finite and nonzero, or a NaN, and only
+   negative if Y is not an integer, and Y a finite nonzero value with
+   0x1p-79 <= |Y| <= 0x1p78, compute X to the power Y.  */
+
+long double
+__powl_helper (long double x, long double y)
+{
+  if (isnan (x) || x < 0)
+    return __ieee754_expl (y * __ieee754_logl (x));
+
+  /* We need to compute Y * log2 (X) to at least 64 bits after the
+     point for normal results (that is, to at least 78 bits
+     precision).  */
+  int x_int_exponent;
+  long double x_frac;
+  x_frac = __frexpl (x, &x_int_exponent);
+  if (x_frac <= 0x0.aaaaaaaaaaaaaaaap0L) /* 2.0L / 3.0L, rounded down */
+    {
+      x_frac *= 2.0;
+      x_int_exponent--;
+    }
+
+  long double log_x_frac_hi, log_x_frac_lo;
+  /* Determine an initial approximation to log (X_FRAC) using
+     POWL_LOG_TABLE, and multiply by a value K/16 to reduce to an
+     interval (24/25, 26/25).  */
+  int k = (int) ((16.0L / x_frac) + 0.5L);
+  log_x_frac_hi = powl_log_table[2 * k - 24];
+  log_x_frac_lo = powl_log_table[2 * k - 23];
+  long double x_frac_low;
+  if (k == 16)
+    x_frac_low = 0.0L;
+  else
+    {
+      /* Mask off low 5 bits of X_FRAC so the multiplication by K/16
+	 is exact.  These bits are small enough that they can be
+	 corrected for by adding log2 (e) * X_FRAC_LOW to the final
+	 result.  */
+      int32_t se;
+      u_int32_t i0, i1;
+      GET_LDOUBLE_WORDS (se, i0, i1, x_frac);
+      x_frac_low = x_frac;
+      i1 &= 0xffffffe0;
+      SET_LDOUBLE_WORDS (x_frac, se, i0, i1);
+      x_frac_low -= x_frac;
+      x_frac_low /= x_frac;
+      x_frac *= k / 16.0L;
+    }
+
+  /* Now compute log (X_FRAC) for X_FRAC in (24/25, 26/25).  Separate
+     W = X_FRAC - 1 into high 16 bits and remaining bits, so that
+     multiplications for low-order power series terms are exact.  The
+     remaining bits are small enough that adding a 64-bit value of
+     log2 (1 + W_LO / (1 + W_HI)) will be a sufficient correction for
+     them.  */
+  long double w = x_frac - 1;
+  long double w_hi, w_lo;
+  int32_t se;
+  u_int32_t i0, i1;
+  GET_LDOUBLE_WORDS (se, i0, i1, w);
+  i0 &= 0xffff0000;
+  i1 = 0;
+  SET_LDOUBLE_WORDS (w_hi, se, i0, i1);
+  w_lo = w - w_hi;
+  long double wp = w_hi;
+  acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, wp);
+  wp *= -w_hi;
+  acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
+	     wp / 2.0L);
+  wp *= -w_hi;
+  acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
+	     wp * 0x0.5555p0L); /* -W_HI**3 / 3, high part.  */
+  acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
+	     wp * 0x0.5555555555555555p-16L); /* -W_HI**3 / 3, low part.  */
+  wp *= -w_hi;
+  acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
+	     wp / 4.0L);
+  /* Subsequent terms are small enough that they only need be computed
+     to 64 bits.  */
+  for (int i = 5; i <= 17; i++)
+    {
+      wp *= -w_hi;
+      acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
+		 wp / i);
+    }
+
+  /* Convert LOG_X_FRAC_HI + LOG_X_FRAC_LO to a base-2 logarithm.  */
+  long double log2_x_frac_hi, log2_x_frac_lo;
+  long double log_x_frac_hi32, log_x_frac_lo64;
+  GET_LDOUBLE_WORDS (se, i0, i1, log_x_frac_hi);
+  i1 = 0;
+  SET_LDOUBLE_WORDS (log_x_frac_hi32, se, i0, i1);
+  log_x_frac_lo64 = (log_x_frac_hi - log_x_frac_hi32) + log_x_frac_lo;
+  long double log2_x_frac_hi1 = log_x_frac_hi32 * log2e_hi;
+  long double log2_x_frac_lo1
+    = log_x_frac_lo64 * log2e_hi + log_x_frac_hi * log2e_lo;
+  log2_x_frac_hi = log2_x_frac_hi1 + log2_x_frac_lo1;
+  log2_x_frac_lo = (log2_x_frac_hi1 - log2_x_frac_hi) + log2_x_frac_lo1;
+
+  /* Correct for the masking off of W_LO.  */
+  long double log2_1p_w_lo;
+  asm ("fyl2xp1"
+       : "=t" (log2_1p_w_lo)
+       : "0" (w_lo / (1.0L + w_hi)), "u" (1.0L)
+       : "st(1)");
+  acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo,
+	     log2_1p_w_lo);
+
+  /* Correct for the masking off of X_FRAC_LOW.  */
+  acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo,
+	     x_frac_low * M_LOG2El);
+
+  /* Add the integer and fractional parts of the base-2 logarithm.  */
+  long double log2_x_hi, log2_x_lo;
+  log2_x_hi = x_int_exponent + log2_x_frac_hi;
+  log2_x_lo = ((x_int_exponent - log2_x_hi) + log2_x_frac_hi) + log2_x_frac_lo;
+
+  /* Compute the base-2 logarithm of the result.  */
+  long double log2_res_hi, log2_res_lo;
+  long double log2_x_hi32, log2_x_lo64;
+  GET_LDOUBLE_WORDS (se, i0, i1, log2_x_hi);
+  i1 = 0;
+  SET_LDOUBLE_WORDS (log2_x_hi32, se, i0, i1);
+  log2_x_lo64 = (log2_x_hi - log2_x_hi32) + log2_x_lo;
+  long double y_hi32, y_lo32;
+  GET_LDOUBLE_WORDS (se, i0, i1, y);
+  i1 = 0;
+  SET_LDOUBLE_WORDS (y_hi32, se, i0, i1);
+  y_lo32 = y - y_hi32;
+  log2_res_hi = log2_x_hi32 * y_hi32;
+  log2_res_lo = log2_x_hi32 * y_lo32 + log2_x_lo64 * y;
+
+  /* Split the base-2 logarithm of the result into integer and
+     fractional parts.  */
+  long double log2_res_int = __roundl (log2_res_hi);
+  long double log2_res_frac = log2_res_hi - log2_res_int + log2_res_lo;
+
+  /* Compute the final result.  */
+  long double res;
+  asm ("f2xm1" : "=t" (res) : "0" (log2_res_frac));
+  res += 1.0L;
+  asm ("fscale" : "=t" (res) : "0" (res), "u" (log2_res_int));
+  return res;
+}
+
+libm_hidden_def (__powl_helper)
diff --git a/sysdeps/x86_64/fpu/e_powl.S b/sysdeps/x86_64/fpu/e_powl.S
index 1b3718522d..ff96cec68a 100644
--- a/sysdeps/x86_64/fpu/e_powl.S
+++ b/sysdeps/x86_64/fpu/e_powl.S
@@ -26,9 +26,9 @@
 	.type one,@object
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
-	.type limit,@object
-limit:	.double 0.29
-	ASM_SIZE_DIRECTIVE(limit)
+	.type p3,@object
+p3:	.byte 0, 0, 0, 0, 0, 0, 0x20, 0x40
+	ASM_SIZE_DIRECTIVE(p3)
 	.type p63,@object
 p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
 	ASM_SIZE_DIRECTIVE(p63)
@@ -131,7 +131,15 @@ ENTRY(__ieee754_powl)
 	fchs			// -0x1p-79 : x
 	jmp	3f
 
-9:	/* OK, we have an integer value for y.  */
+9:	/* OK, we have an integer value for y.  Unless very small
+	   (we use < 8), use the algorithm for real exponent to avoid
+	   accumulation of errors.  */
+	fldl	MO(p3)		// 8 : y : x
+	fld	%st(1)		// y : 8 : y : x
+	fabs			// |y| : 8 : y : x
+	fcomip	%st(1), %st	// 8 : y : x
+	fstp	%st(0)		// y : x
+	jnc	2f
 	mov	-8(%rsp),%eax
 	mov	-4(%rsp),%edx
 	orl	$0, %edx
@@ -167,7 +175,7 @@ ENTRY(__ieee754_powl)
 	ret
 
 	.align ALIGNARG(4)
-2:	// y is a large integer (absolute value at least 1L<<63), but
+2:	// y is a large integer (absolute value at least 8), but
 	// may be odd unless at least 1L<<64.  So it may be necessary
 	// to adjust the sign of a negative result afterwards.
 	fxch			// x : y
@@ -190,31 +198,15 @@ ENTRY(__ieee754_powl)
 	fchs			// -(1L<<78) : |x|
 	.align ALIGNARG(4)
 3:	/* y is a real number.  */
-	fxch			// x : y
-	fldl	MO(one)		// 1.0 : x : y
-	fldl	MO(limit)	// 0.29 : 1.0 : x : y
-	fld	%st(2)		// x : 0.29 : 1.0 : x : y
-	fsub	%st(2)		// x-1 : 0.29 : 1.0 : x : y
-	fabs			// |x-1| : 0.29 : 1.0 : x : y
-	fucompp			// 1.0 : x : y
-	fnstsw
-	fxch			// x : 1.0 : y
-	test	$0x4500,%eax
-	jz	7f
-	fsub	%st(1)		// x-1 : 1.0 : y
-	fyl2xp1			// log2(x) : y
-	jmp	8f
-
-7:	fyl2x			// log2(x) : y
-8:	fmul	%st(1)		// y*log2(x) : y
-	fst	%st(1)		// y*log2(x) : y*log2(x)
-	frndint			// int(y*log2(x)) : y*log2(x)
-	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
-	fxch			// fract(y*log2(x)) : int(y*log2(x))
-	f2xm1			// 2^fract(y*log2(x))-1 : int(y*log2(x))
-	faddl	MO(one)		// 2^fract(y*log2(x)) : int(y*log2(x))
-	fscale			// 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
-	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
+	subq	$40, %rsp
+	cfi_adjust_cfa_offset (40)
+	fstpt	16(%rsp)	// x
+	fstpt	(%rsp)		// <empty>
+	mov	%edx, 32(%rsp)
+	call	HIDDEN_JUMPTARGET (__powl_helper)	// <result>
+	mov	32(%rsp), %edx
+	addq	$40, %rsp
+	cfi_adjust_cfa_offset (-40)
 	testb	$2, %dh
 	jz	292f
 	// x is negative.  If y is an odd integer, negate the result.
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index f33dfed3df..9e7a8adac3 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -2398,6 +2398,8 @@ ifloat: 1
 Test "pow (0x0.ffffffp0, -0x1p24) == 2.7182819094701610539628664526874952929416":
 float: 1
 ifloat: 1
+ildouble: 1
+ldouble: 1
 Test "pow (0x0.ffffffp0, 0x1p24) == 0.3678794302077803437135155590023422899744":
 float: 1
 ifloat: 1
@@ -3575,6 +3577,8 @@ ifloat: 1
 Function: "pow":
 float: 1
 ifloat: 1
+ildouble: 1
+ldouble: 1
 
 Function: "pow_downward":
 float: 1