ieee754: Remove slow paths from asin and acos

asin and acos have slow paths for rounding the last bit that cause some calls to be 500-1500x slower than average calls. These slow paths are rare, a test of a trillion (1.000.000.000.000) random inputs between -1 and 1 showed 32870 slow calls for acos and 4473 for asin, with most occurrences between -1.0 .. -0.9 and 0.9 .. 1.0. The slow paths claim correct rounding and use __sin32() and __cos32() (which compare two result candidates and return the closest one) as the final step, with the second result candidate (res1) having a small offset applied from res. This suggests that res and res1 are intended to be 1 ULP apart (which makes sense for rounding), barring bugs, allowing us to pick either one and still remain within 1 ULP of the exact result. Remove the slow paths as the accuracy is better than 1 ULP even without them, which is enough for glibc. Also remove code comments claiming correctly rounded results. After slow path removal, checking the accuracy of 14.400.000.000 random asin() and acos() inputs showed only three incorrectly rounded (error > 0.5 ULP) results: - asin(-0x1.ee2b43286db75p-1) (0.500002 ULP, same as before) - asin(-0x1.f692ba202abcp-4) (0.500003 ULP, same as before) - asin(-0x1.9915e876fc062p-1) (0.50000000001 ULP, previously exact) The first two had the same error even before this commit, and they did not use the slow path at all. Checking 4934 known randomly found previously-slow-path asin inputs shows 25 calls with incorrectly rounded results, with a maximum error of 0.500000002 ULP (for 0x1.fcd5742999ab8p-1). The previous slow-path code rounded all these inputs correctly (error < 0.5 ULP). The observed average speed increase was 130x. Checking 36240 known randomly found previously-slow-path acos inputs shows 42 calls with incorrectly rounded results, with a maximum error of 0.500000008 ULP (for 0x1.f63845056f35ep-1). The previous "exact" slow-path code showed 34 calls with incorrectly rounded results, with the same maximum error of 0.500000008 ULP (for 0x1.f63845056f35ep-1). The observed average speed increase was 130x. The functions could likely be trimmed more while keeping acceptable accuracy, but this at least gets rid of the egregiously slow cases. Tested on x86_64.
author: Anssi Hannula <anssi.hannula@bitwise.fi> 2020-01-27 12:45:10 +0200
committer: Siddhesh Poyarekar <siddhesh@sourceware.org> 2020-12-18 12:09:23 +0530
commit: f67f9c9af228f6b84579cb8c86312d3a7a206a55 (patch)
tree: ae71bde512cad296b275a769ccf68b8836c32117
parent: 59d572ef613252281e31f867099c43f098319ad7 (diff)
download: glibc-f67f9c9af228f6b84579cb8c86312d3a7a206a55.tar.gz
glibc-f67f9c9af228f6b84579cb8c86312d3a7a206a55.tar.xz
glibc-f67f9c9af228f6b84579cb8c86312d3a7a206a55.zip
1 files changed, 15 insertions, 61 deletions
diff --git a/sysdeps/ieee754/dbl-64/e_asin.c b/sysdeps/ieee754/dbl-64/e_asin.c
index eac3d27fda..8a3b26f664 100644
--- a/sysdeps/ieee754/dbl-64/e_asin.c
+++ b/sysdeps/ieee754/dbl-64/e_asin.c
@@ -25,12 +25,6 @@
 /*               doasin.c sincos32.c dosincos.c mpa.c             */
 /*               sincos.tbl  asincos.tbl  powtwo.tbl root.tbl     */
 /*                                                                */
-/* Ultimate asin/acos routines. Given an IEEE double machine      */
-/* number x, compute the correctly rounded value of               */
-/* arcsin(x)or arccos(x)  according to the function called.       */
-/* Assumption: Machine arithmetic operations are performed in     */
-/* round to nearest mode of IEEE 754 standard.                    */
-/*                                                                */
 /******************************************************************/
 #include "endian.h"
 #include "mydefs.h"
@@ -53,13 +47,7 @@ void __doasin(double x, double dx, double w[]);
 void __dubsin(double x, double dx, double v[]);
 void __dubcos(double x, double dx, double v[]);
 void __docos(double x, double dx, double v[]);
-double __sin32(double x, double res, double res1);
-double __cos32(double x, double res, double res1);
 
-/***************************************************************************/
-/* An ultimate asin routine. Given an IEEE double machine number x         */
-/* it computes the correctly rounded (to nearest) value of arcsin(x)       */
-/***************************************************************************/
 double
 SECTION
 __ieee754_asin(double x){
@@ -100,13 +88,7 @@ __ieee754_asin(double x){
       if (res == res+1.00014*cor) return res;
       else {
 	__doasin(x,0,w);
-	if (w[0]==(w[0]+1.00000001*w[1])) return w[0];
-	else {
-	  y=fabs(x);
-	  res=fabs(w[0]);
-	  res1=fabs(w[0]+1.1*w[1]);
-	  return (m>0)?__sin32(y,res,res1):-__sin32(y,res,res1);
-	}
+	return w[0];
       }
     }
   }
@@ -137,8 +119,7 @@ __ieee754_asin(double x){
 	if (z>1.0e-27) return (m>0)?min(res,res1):-min(res,res1);
 	else if (z<-1.0e-27) return (m>0)?max(res,res1):-max(res,res1);
 	else {
-	  y=fabs(x);
-	  return (m>0)?__sin32(y,res,res1):-__sin32(y,res,res1);
+	  return (m>0)?res:-res;
 	}
       }
     }
@@ -170,8 +151,7 @@ __ieee754_asin(double x){
 	if (z>1.0e-27) return (m>0)?min(res,res1):-min(res,res1);
 	else if (z<-1.0e-27) return (m>0)?max(res,res1):-max(res,res1);
 	else {
-	  y=fabs(x);
-	  return (m>0)?__sin32(y,res,res1):-__sin32(y,res,res1);
+	  return (m>0)?res:-res;
 	}
       }
     }
@@ -205,8 +185,7 @@ __ieee754_asin(double x){
 	if (z>1.0e-27) return (m>0)?min(res,res1):-min(res,res1);
 	else if (z<-1.0e-27) return (m>0)?max(res,res1):-max(res,res1);
 	else {
-	  y=fabs(x);
-	  return (m>0)?__sin32(y,res,res1):-__sin32(y,res,res1);
+	  return (m>0)?res:-res;
 	}
       }
     }
@@ -243,8 +222,7 @@ __ieee754_asin(double x){
 	if (z>1.0e-27) return (m>0)?min(res,res1):-min(res,res1);
 	else if (z<-1.0e-27) return (m>0)?max(res,res1):-max(res,res1);
 	else {
-	  y=fabs(x);
-	  return (m>0)?__sin32(y,res,res1):-__sin32(y,res,res1);
+	  return (m>0)?res:-res;
 	}
       }
     }
@@ -282,8 +260,7 @@ __ieee754_asin(double x){
 	if (z>1.0e-27) return (m>0)?min(res,res1):-min(res,res1);
 	else if (z<-1.0e-27) return (m>0)?max(res,res1):-max(res,res1);
 	else {
-	  y=fabs(x);
-	  return (m>0)?__sin32(y,res,res1):-__sin32(y,res,res1);
+	  return (m>0)?res:-res;
 	}
       }
     }
@@ -313,13 +290,7 @@ __ieee754_asin(double x){
       res1=hp0.x-2.0*w[0];
       cor=((hp0.x-res1)-2.0*w[0])+(hp1.x-2.0*w[1]);
       res = res1+cor;
-      cor = (res1-res)+cor;
-      if (res==(res+1.0000001*cor)) return (m>0)?res:-res;
-      else {
-	y=fabs(x);
-	res1=res+1.1*cor;
-	return (m>0)?__sin32(y,res,res1):-__sin32(y,res,res1);
-      }
+      return (m>0)?res:-res;
     }
   }    /*   else  if (k < 0x3ff00000)    */
   /*---------------------------- |x|>=1 -------------------------------*/
@@ -388,12 +359,7 @@ __ieee754_acos(double x)
 	r=hp0.x-w[0];
 	cor=((hp0.x-r)-w[0])+(hp1.x-w[1]);
 	res=r+cor;
-	cor=(r-res)+cor;
-	if (res ==(res +1.00000001*cor)) return res;
-	else {
-	  res1=res+1.1*cor;
-	  return __cos32(x,res,res1);
-	}
+	return res;
       }
     }
   }    /*   else  if (k < 0x3fc00000)    */
@@ -429,7 +395,7 @@ __ieee754_acos(double x)
 	z=(w[0]-x)+w[1];
 	if (z>1.0e-27) return max(res,res1);
 	else if (z<-1.0e-27) return min(res,res1);
-	else return __cos32(x,res,res1);
+	else return res;
       }
     }
   }    /*   else  if (k < 0x3fe00000)    */
@@ -464,7 +430,7 @@ __ieee754_acos(double x)
        z=(w[0]-x)+w[1];
        if (z>1.0e-27) return max(res,res1);
        else if (z<-1.0e-27) return min(res,res1);
-       else return __cos32(x,res,res1);
+       else return res;
      }
    }
   }    /*   else  if (k < 0x3fe80000)    */
@@ -499,7 +465,7 @@ __ieee754_acos(double x)
 	z=(w[0]-x)+w[1];
 	if (z>1.0e-27) return max(res,res1);
 	else if (z<-1.0e-27) return min(res,res1);
-	else return __cos32(x,res,res1);
+	else return res;
       }
     }
   }    /*   else  if (k < 0x3fed8000)    */
@@ -535,7 +501,7 @@ __ieee754_acos(double x)
 	z=(w[0]-x)+w[1];
 	if (z>1.0e-27) return max(res,res1);
 	else if (z<-1.0e-27) return min(res,res1);
-	else return __cos32(x,res,res1);
+	else return res;
       }
     }
   }    /*   else  if (k < 0x3fee8000)    */
@@ -571,7 +537,7 @@ __ieee754_acos(double x)
        z=(w[0]-x)+w[1];
        if (z>1.0e-27) return max(res,res1);
        else if (z<-1.0e-27) return min(res,res1);
-       else return __cos32(x,res,res1);
+       else return res;
      }
    }
   }    /*   else  if (k < 0x3fef0000)    */
@@ -602,13 +568,7 @@ __ieee754_acos(double x)
 	res1=hp0.x-w[0];
 	cor=((hp0.x-res1)-w[0])+(hp1.x-w[1]);
 	res = res1+cor;
-	cor = (res1-res)+cor;
-	if (res==(res+1.000001*cor)) return (res+res);
-	else {
-	  res=res+res;
-	  res1=res+1.2*cor;
-	  return __cos32(x,res,res1);
-	}
+	return (res+res);
       }
     }
     else {
@@ -620,13 +580,7 @@ __ieee754_acos(double x)
 	cc=(y-c)+cc;
 	__doasin(c,cc,w);
 	res = w[0];
-	cor=w[1];
-	if (res==(res+1.000001*cor)) return (res+res);
-	else {
-	  res=res+res;
-	  res1=res+1.2*cor;
-	  return __cos32(x,res,res1);
-	}
+	return (res+res);
       }
     }
   }    /*   else  if (k < 0x3ff00000)    */
author	Anssi Hannula <anssi.hannula@bitwise.fi>	2020-01-27 12:45:10 +0200
committer	Siddhesh Poyarekar <siddhesh@sourceware.org>	2020-12-18 12:09:23 +0530
commit	f67f9c9af228f6b84579cb8c86312d3a7a206a55 (patch)
tree	ae71bde512cad296b275a769ccf68b8836c32117
parent	59d572ef613252281e31f867099c43f098319ad7 (diff)
download	glibc-f67f9c9af228f6b84579cb8c86312d3a7a206a55.tar.gz glibc-f67f9c9af228f6b84579cb8c86312d3a7a206a55.tar.xz glibc-f67f9c9af228f6b84579cb8c86312d3a7a206a55.zip