about summary refs log tree commit diff
path: root/REORG.TODO/sysdeps/ieee754/dbl-64/mpa.c
diff options
context:
space:
mode:
authorZack Weinberg <zackw@panix.com>2017-06-08 15:39:03 -0400
committerZack Weinberg <zackw@panix.com>2017-06-08 15:39:03 -0400
commit5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree4470480d904b65cf14ca524f96f79eca818c3eaf /REORG.TODO/sysdeps/ieee754/dbl-64/mpa.c
parent199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
downloadglibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.tar.gz
glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.tar.xz
glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.zip
Prepare for radical source tree reorganization. zack/build-layout-experiment
All top-level files and directories are moved into a temporary storage
directory, REORG.TODO, except for files that will certainly still
exist in their current form at top level when we're done (COPYING,
COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which
are moved to the new directory OldChangeLogs, instead), and the
generated file INSTALL (which is just deleted; in the new order, there
will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/ieee754/dbl-64/mpa.c')
-rw-r--r--REORG.TODO/sysdeps/ieee754/dbl-64/mpa.c906
1 files changed, 906 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/ieee754/dbl-64/mpa.c b/REORG.TODO/sysdeps/ieee754/dbl-64/mpa.c
new file mode 100644
index 0000000000..3820335172
--- /dev/null
+++ b/REORG.TODO/sysdeps/ieee754/dbl-64/mpa.c
@@ -0,0 +1,906 @@
+/*
+ * IBM Accurate Mathematical Library
+ * written by International Business Machines Corp.
+ * Copyright (C) 2001-2017 Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+/************************************************************************/
+/*  MODULE_NAME: mpa.c                                                  */
+/*                                                                      */
+/*  FUNCTIONS:                                                          */
+/*               mcr                                                    */
+/*               acr                                                    */
+/*               cpy                                                    */
+/*               norm                                                   */
+/*               denorm                                                 */
+/*               mp_dbl                                                 */
+/*               dbl_mp                                                 */
+/*               add_magnitudes                                         */
+/*               sub_magnitudes                                         */
+/*               add                                                    */
+/*               sub                                                    */
+/*               mul                                                    */
+/*               inv                                                    */
+/*               dvd                                                    */
+/*                                                                      */
+/* Arithmetic functions for multiple precision numbers.                 */
+/* Relative errors are bounded                                          */
+/************************************************************************/
+
+
+#include "endian.h"
+#include "mpa.h"
+#include <sys/param.h>
+#include <alloca.h>
+
+#ifndef SECTION
+# define SECTION
+#endif
+
+#ifndef NO__CONST
+const mp_no __mpone = { 1, { 1.0, 1.0 } };
+const mp_no __mptwo = { 1, { 1.0, 2.0 } };
+#endif
+
+#ifndef NO___ACR
+/* Compare mantissa of two multiple precision numbers regardless of the sign
+   and exponent of the numbers.  */
+static int
+mcr (const mp_no *x, const mp_no *y, int p)
+{
+  long i;
+  long p2 = p;
+  for (i = 1; i <= p2; i++)
+    {
+      if (X[i] == Y[i])
+	continue;
+      else if (X[i] > Y[i])
+	return 1;
+      else
+	return -1;
+    }
+  return 0;
+}
+
+/* Compare the absolute values of two multiple precision numbers.  */
+int
+__acr (const mp_no *x, const mp_no *y, int p)
+{
+  long i;
+
+  if (X[0] == 0)
+    {
+      if (Y[0] == 0)
+	i = 0;
+      else
+	i = -1;
+    }
+  else if (Y[0] == 0)
+    i = 1;
+  else
+    {
+      if (EX > EY)
+	i = 1;
+      else if (EX < EY)
+	i = -1;
+      else
+	i = mcr (x, y, p);
+    }
+
+  return i;
+}
+#endif
+
+#ifndef NO___CPY
+/* Copy multiple precision number X into Y.  They could be the same
+   number.  */
+void
+__cpy (const mp_no *x, mp_no *y, int p)
+{
+  long i;
+
+  EY = EX;
+  for (i = 0; i <= p; i++)
+    Y[i] = X[i];
+}
+#endif
+
+#ifndef NO___MP_DBL
+/* Convert a multiple precision number *X into a double precision
+   number *Y, normalized case (|x| >= 2**(-1022))).  X has precision
+   P, which is positive.  */
+static void
+norm (const mp_no *x, double *y, int p)
+{
+# define R RADIXI
+  long i;
+  double c;
+  mantissa_t a, u, v, z[5];
+  if (p < 5)
+    {
+      if (p == 1)
+	c = X[1];
+      else if (p == 2)
+	c = X[1] + R * X[2];
+      else if (p == 3)
+	c = X[1] + R * (X[2] + R * X[3]);
+      else /* p == 4.  */
+	c = (X[1] + R * X[2]) + R * R * (X[3] + R * X[4]);
+    }
+  else
+    {
+      for (a = 1, z[1] = X[1]; z[1] < TWO23; )
+	{
+	  a *= 2;
+	  z[1] *= 2;
+	}
+
+      for (i = 2; i < 5; i++)
+	{
+	  mantissa_store_t d, r;
+	  d = X[i] * (mantissa_store_t) a;
+	  DIV_RADIX (d, r);
+	  z[i] = r;
+	  z[i - 1] += d;
+	}
+
+      u = ALIGN_DOWN_TO (z[3], TWO19);
+      v = z[3] - u;
+
+      if (v == TWO18)
+	{
+	  if (z[4] == 0)
+	    {
+	      for (i = 5; i <= p; i++)
+		{
+		  if (X[i] == 0)
+		    continue;
+		  else
+		    {
+		      z[3] += 1;
+		      break;
+		    }
+		}
+	    }
+	  else
+	    z[3] += 1;
+	}
+
+      c = (z[1] + R * (z[2] + R * z[3])) / a;
+    }
+
+  c *= X[0];
+
+  for (i = 1; i < EX; i++)
+    c *= RADIX;
+  for (i = 1; i > EX; i--)
+    c *= RADIXI;
+
+  *y = c;
+# undef R
+}
+
+/* Convert a multiple precision number *X into a double precision
+   number *Y, Denormal case  (|x| < 2**(-1022))).  */
+static void
+denorm (const mp_no *x, double *y, int p)
+{
+  long i, k;
+  long p2 = p;
+  double c;
+  mantissa_t u, z[5];
+
+# define R RADIXI
+  if (EX < -44 || (EX == -44 && X[1] < TWO5))
+    {
+      *y = 0;
+      return;
+    }
+
+  if (p2 == 1)
+    {
+      if (EX == -42)
+	{
+	  z[1] = X[1] + TWO10;
+	  z[2] = 0;
+	  z[3] = 0;
+	  k = 3;
+	}
+      else if (EX == -43)
+	{
+	  z[1] = TWO10;
+	  z[2] = X[1];
+	  z[3] = 0;
+	  k = 2;
+	}
+      else
+	{
+	  z[1] = TWO10;
+	  z[2] = 0;
+	  z[3] = X[1];
+	  k = 1;
+	}
+    }
+  else if (p2 == 2)
+    {
+      if (EX == -42)
+	{
+	  z[1] = X[1] + TWO10;
+	  z[2] = X[2];
+	  z[3] = 0;
+	  k = 3;
+	}
+      else if (EX == -43)
+	{
+	  z[1] = TWO10;
+	  z[2] = X[1];
+	  z[3] = X[2];
+	  k = 2;
+	}
+      else
+	{
+	  z[1] = TWO10;
+	  z[2] = 0;
+	  z[3] = X[1];
+	  k = 1;
+	}
+    }
+  else
+    {
+      if (EX == -42)
+	{
+	  z[1] = X[1] + TWO10;
+	  z[2] = X[2];
+	  k = 3;
+	}
+      else if (EX == -43)
+	{
+	  z[1] = TWO10;
+	  z[2] = X[1];
+	  k = 2;
+	}
+      else
+	{
+	  z[1] = TWO10;
+	  z[2] = 0;
+	  k = 1;
+	}
+      z[3] = X[k];
+    }
+
+  u = ALIGN_DOWN_TO (z[3], TWO5);
+
+  if (u == z[3])
+    {
+      for (i = k + 1; i <= p2; i++)
+	{
+	  if (X[i] == 0)
+	    continue;
+	  else
+	    {
+	      z[3] += 1;
+	      break;
+	    }
+	}
+    }
+
+  c = X[0] * ((z[1] + R * (z[2] + R * z[3])) - TWO10);
+
+  *y = c * TWOM1032;
+# undef R
+}
+
+/* Convert multiple precision number *X into double precision number *Y.  The
+   result is correctly rounded to the nearest/even.  */
+void
+__mp_dbl (const mp_no *x, double *y, int p)
+{
+  if (X[0] == 0)
+    {
+      *y = 0;
+      return;
+    }
+
+  if (__glibc_likely (EX > -42 || (EX == -42 && X[1] >= TWO10)))
+    norm (x, y, p);
+  else
+    denorm (x, y, p);
+}
+#endif
+
+/* Get the multiple precision equivalent of X into *Y.  If the precision is too
+   small, the result is truncated.  */
+void
+SECTION
+__dbl_mp (double x, mp_no *y, int p)
+{
+  long i, n;
+  long p2 = p;
+
+  /* Sign.  */
+  if (x == 0)
+    {
+      Y[0] = 0;
+      return;
+    }
+  else if (x > 0)
+    Y[0] = 1;
+  else
+    {
+      Y[0] = -1;
+      x = -x;
+    }
+
+  /* Exponent.  */
+  for (EY = 1; x >= RADIX; EY += 1)
+    x *= RADIXI;
+  for (; x < 1; EY -= 1)
+    x *= RADIX;
+
+  /* Digits.  */
+  n = MIN (p2, 4);
+  for (i = 1; i <= n; i++)
+    {
+      INTEGER_OF (x, Y[i]);
+      x *= RADIX;
+    }
+  for (; i <= p2; i++)
+    Y[i] = 0;
+}
+
+/* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0.  The
+   sign of the sum *Z is not changed.  X and Y may overlap but not X and Z or
+   Y and Z.  No guard digit is used.  The result equals the exact sum,
+   truncated.  */
+static void
+SECTION
+add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
+{
+  long i, j, k;
+  long p2 = p;
+  mantissa_t zk;
+
+  EZ = EX;
+
+  i = p2;
+  j = p2 + EY - EX;
+  k = p2 + 1;
+
+  if (__glibc_unlikely (j < 1))
+    {
+      __cpy (x, z, p);
+      return;
+    }
+
+  zk = 0;
+
+  for (; j > 0; i--, j--)
+    {
+      zk += X[i] + Y[j];
+      if (zk >= RADIX)
+	{
+	  Z[k--] = zk - RADIX;
+	  zk = 1;
+	}
+      else
+	{
+	  Z[k--] = zk;
+	  zk = 0;
+	}
+    }
+
+  for (; i > 0; i--)
+    {
+      zk += X[i];
+      if (zk >= RADIX)
+	{
+	  Z[k--] = zk - RADIX;
+	  zk = 1;
+	}
+      else
+	{
+	  Z[k--] = zk;
+	  zk = 0;
+	}
+    }
+
+  if (zk == 0)
+    {
+      for (i = 1; i <= p2; i++)
+	Z[i] = Z[i + 1];
+    }
+  else
+    {
+      Z[1] = zk;
+      EZ += 1;
+    }
+}
+
+/* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0.
+   The sign of the difference *Z is not changed.  X and Y may overlap but not X
+   and Z or Y and Z.  One guard digit is used.  The error is less than one
+   ULP.  */
+static void
+SECTION
+sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
+{
+  long i, j, k;
+  long p2 = p;
+  mantissa_t zk;
+
+  EZ = EX;
+  i = p2;
+  j = p2 + EY - EX;
+  k = p2;
+
+  /* Y is too small compared to X, copy X over to the result.  */
+  if (__glibc_unlikely (j < 1))
+    {
+      __cpy (x, z, p);
+      return;
+    }
+
+  /* The relevant least significant digit in Y is non-zero, so we factor it in
+     to enhance accuracy.  */
+  if (j < p2 && Y[j + 1] > 0)
+    {
+      Z[k + 1] = RADIX - Y[j + 1];
+      zk = -1;
+    }
+  else
+    zk = Z[k + 1] = 0;
+
+  /* Subtract and borrow.  */
+  for (; j > 0; i--, j--)
+    {
+      zk += (X[i] - Y[j]);
+      if (zk < 0)
+	{
+	  Z[k--] = zk + RADIX;
+	  zk = -1;
+	}
+      else
+	{
+	  Z[k--] = zk;
+	  zk = 0;
+	}
+    }
+
+  /* We're done with digits from Y, so it's just digits in X.  */
+  for (; i > 0; i--)
+    {
+      zk += X[i];
+      if (zk < 0)
+	{
+	  Z[k--] = zk + RADIX;
+	  zk = -1;
+	}
+      else
+	{
+	  Z[k--] = zk;
+	  zk = 0;
+	}
+    }
+
+  /* Normalize.  */
+  for (i = 1; Z[i] == 0; i++)
+    ;
+  EZ = EZ - i + 1;
+  for (k = 1; i <= p2 + 1; )
+    Z[k++] = Z[i++];
+  for (; k <= p2; )
+    Z[k++] = 0;
+}
+
+/* Add *X and *Y and store the result in *Z.  X and Y may overlap, but not X
+   and Z or Y and Z.  One guard digit is used.  The error is less than one
+   ULP.  */
+void
+SECTION
+__add (const mp_no *x, const mp_no *y, mp_no *z, int p)
+{
+  int n;
+
+  if (X[0] == 0)
+    {
+      __cpy (y, z, p);
+      return;
+    }
+  else if (Y[0] == 0)
+    {
+      __cpy (x, z, p);
+      return;
+    }
+
+  if (X[0] == Y[0])
+    {
+      if (__acr (x, y, p) > 0)
+	{
+	  add_magnitudes (x, y, z, p);
+	  Z[0] = X[0];
+	}
+      else
+	{
+	  add_magnitudes (y, x, z, p);
+	  Z[0] = Y[0];
+	}
+    }
+  else
+    {
+      if ((n = __acr (x, y, p)) == 1)
+	{
+	  sub_magnitudes (x, y, z, p);
+	  Z[0] = X[0];
+	}
+      else if (n == -1)
+	{
+	  sub_magnitudes (y, x, z, p);
+	  Z[0] = Y[0];
+	}
+      else
+	Z[0] = 0;
+    }
+}
+
+/* Subtract *Y from *X and return the result in *Z.  X and Y may overlap but
+   not X and Z or Y and Z.  One guard digit is used.  The error is less than
+   one ULP.  */
+void
+SECTION
+__sub (const mp_no *x, const mp_no *y, mp_no *z, int p)
+{
+  int n;
+
+  if (X[0] == 0)
+    {
+      __cpy (y, z, p);
+      Z[0] = -Z[0];
+      return;
+    }
+  else if (Y[0] == 0)
+    {
+      __cpy (x, z, p);
+      return;
+    }
+
+  if (X[0] != Y[0])
+    {
+      if (__acr (x, y, p) > 0)
+	{
+	  add_magnitudes (x, y, z, p);
+	  Z[0] = X[0];
+	}
+      else
+	{
+	  add_magnitudes (y, x, z, p);
+	  Z[0] = -Y[0];
+	}
+    }
+  else
+    {
+      if ((n = __acr (x, y, p)) == 1)
+	{
+	  sub_magnitudes (x, y, z, p);
+	  Z[0] = X[0];
+	}
+      else if (n == -1)
+	{
+	  sub_magnitudes (y, x, z, p);
+	  Z[0] = -Y[0];
+	}
+      else
+	Z[0] = 0;
+    }
+}
+
+#ifndef NO__MUL
+/* Multiply *X and *Y and store result in *Z.  X and Y may overlap but not X
+   and Z or Y and Z.  For P in [1, 2, 3], the exact result is truncated to P
+   digits.  In case P > 3 the error is bounded by 1.001 ULP.  */
+void
+SECTION
+__mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
+{
+  long i, j, k, ip, ip2;
+  long p2 = p;
+  mantissa_store_t zk;
+  const mp_no *a;
+  mantissa_store_t *diag;
+
+  /* Is z=0?  */
+  if (__glibc_unlikely (X[0] * Y[0] == 0))
+    {
+      Z[0] = 0;
+      return;
+    }
+
+  /* We need not iterate through all X's and Y's since it's pointless to
+     multiply zeroes.  Here, both are zero...  */
+  for (ip2 = p2; ip2 > 0; ip2--)
+    if (X[ip2] != 0 || Y[ip2] != 0)
+      break;
+
+  a = X[ip2] != 0 ? y : x;
+
+  /* ... and here, at least one of them is still zero.  */
+  for (ip = ip2; ip > 0; ip--)
+    if (a->d[ip] != 0)
+      break;
+
+  /* The product looks like this for p = 3 (as an example):
+
+
+				a1    a2    a3
+		 x		b1    b2    b3
+		 -----------------------------
+			     a1*b3 a2*b3 a3*b3
+		       a1*b2 a2*b2 a3*b2
+		 a1*b1 a2*b1 a3*b1
+
+     So our K needs to ideally be P*2, but we're limiting ourselves to P + 3
+     for P >= 3.  We compute the above digits in two parts; the last P-1
+     digits and then the first P digits.  The last P-1 digits are a sum of
+     products of the input digits from P to P-k where K is 0 for the least
+     significant digit and increases as we go towards the left.  The product
+     term is of the form X[k]*X[P-k] as can be seen in the above example.
+
+     The first P digits are also a sum of products with the same product term,
+     except that the sum is from 1 to k.  This is also evident from the above
+     example.
+
+     Another thing that becomes evident is that only the most significant
+     ip+ip2 digits of the result are non-zero, where ip and ip2 are the
+     'internal precision' of the input numbers, i.e. digits after ip and ip2
+     are all 0.  */
+
+  k = (__glibc_unlikely (p2 < 3)) ? p2 + p2 : p2 + 3;
+
+  while (k > ip + ip2 + 1)
+    Z[k--] = 0;
+
+  zk = 0;
+
+  /* Precompute sums of diagonal elements so that we can directly use them
+     later.  See the next comment to know we why need them.  */
+  diag = alloca (k * sizeof (mantissa_store_t));
+  mantissa_store_t d = 0;
+  for (i = 1; i <= ip; i++)
+    {
+      d += X[i] * (mantissa_store_t) Y[i];
+      diag[i] = d;
+    }
+  while (i < k)
+    diag[i++] = d;
+
+  while (k > p2)
+    {
+      long lim = k / 2;
+
+      if (k % 2 == 0)
+	/* We want to add this only once, but since we subtract it in the sum
+	   of products above, we add twice.  */
+	zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
+
+      for (i = k - p2, j = p2; i < j; i++, j--)
+	zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
+
+      zk -= diag[k - 1];
+
+      DIV_RADIX (zk, Z[k]);
+      k--;
+    }
+
+  /* The real deal.  Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i
+     goes from 1 -> k - 1 and j goes the same range in reverse.  To reduce the
+     number of multiplications, we halve the range and if k is an even number,
+     add the diagonal element X[k/2]Y[k/2].  Through the half range, we compute
+     X[i] * Y[j] as (X[i] + X[j]) * (Y[i] + Y[j]) - X[i] * Y[i] - X[j] * Y[j].
+
+     This reduction tells us that we're summing two things, the first term
+     through the half range and the negative of the sum of the product of all
+     terms of X and Y in the full range.  i.e.
+
+     SUM(X[i] * Y[i]) for k terms.  This is precalculated above for each k in
+     a single loop so that it completes in O(n) time and can hence be directly
+     used in the loop below.  */
+  while (k > 1)
+    {
+      long lim = k / 2;
+
+      if (k % 2 == 0)
+	/* We want to add this only once, but since we subtract it in the sum
+	   of products above, we add twice.  */
+        zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
+
+      for (i = 1, j = k - 1; i < j; i++, j--)
+	zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
+
+      zk -= diag[k - 1];
+
+      DIV_RADIX (zk, Z[k]);
+      k--;
+    }
+  Z[k] = zk;
+
+  /* Get the exponent sum into an intermediate variable.  This is a subtle
+     optimization, where given enough registers, all operations on the exponent
+     happen in registers and the result is written out only once into EZ.  */
+  int e = EX + EY;
+
+  /* Is there a carry beyond the most significant digit?  */
+  if (__glibc_unlikely (Z[1] == 0))
+    {
+      for (i = 1; i <= p2; i++)
+	Z[i] = Z[i + 1];
+      e--;
+    }
+
+  EZ = e;
+  Z[0] = X[0] * Y[0];
+}
+#endif
+
+#ifndef NO__SQR
+/* Square *X and store result in *Y.  X and Y may not overlap.  For P in
+   [1, 2, 3], the exact result is truncated to P digits.  In case P > 3 the
+   error is bounded by 1.001 ULP.  This is a faster special case of
+   multiplication.  */
+void
+SECTION
+__sqr (const mp_no *x, mp_no *y, int p)
+{
+  long i, j, k, ip;
+  mantissa_store_t yk;
+
+  /* Is z=0?  */
+  if (__glibc_unlikely (X[0] == 0))
+    {
+      Y[0] = 0;
+      return;
+    }
+
+  /* We need not iterate through all X's since it's pointless to
+     multiply zeroes.  */
+  for (ip = p; ip > 0; ip--)
+    if (X[ip] != 0)
+      break;
+
+  k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
+
+  while (k > 2 * ip + 1)
+    Y[k--] = 0;
+
+  yk = 0;
+
+  while (k > p)
+    {
+      mantissa_store_t yk2 = 0;
+      long lim = k / 2;
+
+      if (k % 2 == 0)
+	yk += X[lim] * (mantissa_store_t) X[lim];
+
+      /* In __mul, this loop (and the one within the next while loop) run
+         between a range to calculate the mantissa as follows:
+
+         Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
+		+ X[n] * Y[k]
+
+         For X == Y, we can get away with summing halfway and doubling the
+	 result.  For cases where the range size is even, the mid-point needs
+	 to be added separately (above).  */
+      for (i = k - p, j = p; i < j; i++, j--)
+	yk2 += X[i] * (mantissa_store_t) X[j];
+
+      yk += 2 * yk2;
+
+      DIV_RADIX (yk, Y[k]);
+      k--;
+    }
+
+  while (k > 1)
+    {
+      mantissa_store_t yk2 = 0;
+      long lim = k / 2;
+
+      if (k % 2 == 0)
+	yk += X[lim] * (mantissa_store_t) X[lim];
+
+      /* Likewise for this loop.  */
+      for (i = 1, j = k - 1; i < j; i++, j--)
+	yk2 += X[i] * (mantissa_store_t) X[j];
+
+      yk += 2 * yk2;
+
+      DIV_RADIX (yk, Y[k]);
+      k--;
+    }
+  Y[k] = yk;
+
+  /* Squares are always positive.  */
+  Y[0] = 1;
+
+  /* Get the exponent sum into an intermediate variable.  This is a subtle
+     optimization, where given enough registers, all operations on the exponent
+     happen in registers and the result is written out only once into EZ.  */
+  int e = EX * 2;
+
+  /* Is there a carry beyond the most significant digit?  */
+  if (__glibc_unlikely (Y[1] == 0))
+    {
+      for (i = 1; i <= p; i++)
+	Y[i] = Y[i + 1];
+      e--;
+    }
+
+  EY = e;
+}
+#endif
+
+/* Invert *X and store in *Y.  Relative error bound:
+   - For P = 2: 1.001 * R ^ (1 - P)
+   - For P = 3: 1.063 * R ^ (1 - P)
+   - For P > 3: 2.001 * R ^ (1 - P)
+
+   *X = 0 is not permissible.  */
+static void
+SECTION
+__inv (const mp_no *x, mp_no *y, int p)
+{
+  long i;
+  double t;
+  mp_no z, w;
+  static const int np1[] =
+    { 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+  };
+
+  __cpy (x, &z, p);
+  z.e = 0;
+  __mp_dbl (&z, &t, p);
+  t = 1 / t;
+  __dbl_mp (t, y, p);
+  EY -= EX;
+
+  for (i = 0; i < np1[p]; i++)
+    {
+      __cpy (y, &w, p);
+      __mul (x, &w, y, p);
+      __sub (&__mptwo, y, &z, p);
+      __mul (&w, &z, y, p);
+    }
+}
+
+/* Divide *X by *Y and store result in *Z.  X and Y may overlap but not X and Z
+   or Y and Z.  Relative error bound:
+   - For P = 2: 2.001 * R ^ (1 - P)
+   - For P = 3: 2.063 * R ^ (1 - P)
+   - For P > 3: 3.001 * R ^ (1 - P)
+
+   *X = 0 is not permissible.  */
+void
+SECTION
+__dvd (const mp_no *x, const mp_no *y, mp_no *z, int p)
+{
+  mp_no w;
+
+  if (X[0] == 0)
+    Z[0] = 0;
+  else
+    {
+      __inv (y, &w, p);
+      __mul (x, &w, z, p);
+    }
+}