Improve performance of sinf and cosf

The second patch improves performance of sinf and cosf using the same algorithms and polynomials. The returned values are identical to sincosf for the same input. ULP definitions for AArch64 and x64 are updated. sinf/cosf throughput gains on Cortex-A72: * |x| < 0x1p-12 : 1.2x * |x| < M_PI_4 : 1.8x * |x| < 2 * M_PI: 1.7x * |x| < 120.0 : 2.3x * |x| < Inf : 3.0x * NEWS: Mention sinf, cosf, sincosf. * sysdeps/aarch64/libm-test-ulps: Update ULP for sinf, cosf, sincosf. * sysdeps/x86_64/fpu/libm-test-ulps: Update ULP for sinf and cosf. * sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c: Add definitions of constants rather than including generic sincosf.h. * sysdeps/x86_64/fpu/s_sincosf_data.c: Remove. * sysdeps/ieee754/flt-32/s_cosf.c (cosf): Rewrite. * sysdeps/ieee754/flt-32/s_sincosf.h (reduced_sin): Remove. (reduced_cos): Remove. (sinf_poly): New function. * sysdeps/ieee754/flt-32/s_sinf.c (sinf): Rewrite.
author: Wilco Dijkstra <wdijkstr@arm.com> 2018-08-14 10:45:59 +0100
committer: Wilco Dijkstra <wdijkstr@arm.com> 2018-08-14 10:45:59 +0100
commit: 599cf3976679e1b345307d9c02057f02aa95528f (patch)
tree: ede9ff73c0eb51bce1a9c540b6daf5c0675b5afd /sysdeps/ieee754/flt-32/s_sincosf.h
parent: e95c6f61920a0f9237cfb292fa44ad500e1df09b (diff)
download: glibc-599cf3976679e1b345307d9c02057f02aa95528f.tar.gz
glibc-599cf3976679e1b345307d9c02057f02aa95528f.tar.xz
glibc-599cf3976679e1b345307d9c02057f02aa95528f.zip
1 files changed, 31 insertions, 140 deletions
diff --git a/sysdeps/ieee754/flt-32/s_sincosf.h b/sysdeps/ieee754/flt-32/s_sincosf.h
index d3d7b4d6f3..1dcb04f235 100644
--- a/sysdeps/ieee754/flt-32/s_sincosf.h
+++ b/sysdeps/ieee754/flt-32/s_sincosf.h
@@ -1,5 +1,5 @@
 /* Used by sinf, cosf and sincosf functions.
-   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   Copyright (C) 2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -20,145 +20,6 @@
 #include <math.h>
 #include "math_config.h"
 
-/* Chebyshev constants for cos, range -PI/4 - PI/4.  */
-static const double C0 = -0x1.ffffffffe98aep-2;
-static const double C1 =  0x1.55555545c50c7p-5;
-static const double C2 = -0x1.6c16b348b6874p-10;
-static const double C3 =  0x1.a00eb9ac43ccp-16;
-static const double C4 = -0x1.23c97dd8844d7p-22;
-
-/* Chebyshev constants for sin, range -PI/4 - PI/4.  */
-static const double S0 = -0x1.5555555551cd9p-3;
-static const double S1 =  0x1.1111110c2688bp-7;
-static const double S2 = -0x1.a019f8b4bd1f9p-13;
-static const double S3 =  0x1.71d7264e6b5b4p-19;
-static const double S4 = -0x1.a947e1674b58ap-26;
-
-/* Chebyshev constants for sin, range 2^-27 - 2^-5.  */
-static const double SS0 = -0x1.555555543d49dp-3;
-static const double SS1 =  0x1.110f475cec8c5p-7;
-
-/* Chebyshev constants for cos, range 2^-27 - 2^-5.  */
-static const double CC0 = -0x1.fffffff5cc6fdp-2;
-static const double CC1 =  0x1.55514b178dac5p-5;
-
-/* PI/2 with 98 bits of accuracy.  */
-static const double PI_2_hi = 0x1.921fb544p+0;
-static const double PI_2_lo = 0x1.0b4611a626332p-34;
-
-static const double SMALL = 0x1p-50; /* 2^-50.  */
-static const double inv_PI_4 = 0x1.45f306dc9c883p+0; /* 4/PI.  */
-
-#define FLOAT_EXPONENT_SHIFT 23
-#define FLOAT_EXPONENT_BIAS 127
-
-static const double pio2_table[] = {
-  0 * M_PI_2,
-  1 * M_PI_2,
-  2 * M_PI_2,
-  3 * M_PI_2,
-  4 * M_PI_2,
-  5 * M_PI_2
-};
-
-static const double invpio4_table[] = {
-  0x0p+0,
-  0x1.45f306cp+0,
-  0x1.c9c882ap-28,
-  0x1.4fe13a8p-58,
-  0x1.f47d4dp-85,
-  0x1.bb81b6cp-112,
-  0x1.4acc9ep-142,
-  0x1.0e4107cp-169
-};
-
-static const double ones[] = { 1.0, -1.0 };
-
-/* Compute the sine value using Chebyshev polynomials where
-   THETA is the range reduced absolute value of the input
-   and it is less than Pi/4,
-   N is calculated as trunc(|x|/(Pi/4)) + 1 and it is used to decide
-   whether a sine or cosine approximation is more accurate and
-   SIGNBIT is used to add the correct sign after the Chebyshev
-   polynomial is computed.  */
-static inline float
-reduced_sin (const double theta, const unsigned int n,
-	 const unsigned int signbit)
-{
-  double sx;
-  const double theta2 = theta * theta;
-  /* We are operating on |x|, so we need to add back the original
-     signbit for sinf.  */
-  double sign;
-  /* Determine positive or negative primary interval.  */
-  sign = ones[((n >> 2) & 1) ^ signbit];
-  /* Are we in the primary interval of sin or cos?  */
-  if ((n & 2) == 0)
-    {
-      /* Here sinf() is calculated using sin Chebyshev polynomial:
-	x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
-      sx = S3 + theta2 * S4;     /* S3+x^2*S4.  */
-      sx = S2 + theta2 * sx;     /* S2+x^2*(S3+x^2*S4).  */
-      sx = S1 + theta2 * sx;     /* S1+x^2*(S2+x^2*(S3+x^2*S4)).  */
-      sx = S0 + theta2 * sx;     /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))).  */
-      sx = theta + theta * theta2 * sx;
-    }
-  else
-    {
-     /* Here sinf() is calculated using cos Chebyshev polynomial:
-	1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
-      sx = C3 + theta2 * C4;     /* C3+x^2*C4.  */
-      sx = C2 + theta2 * sx;     /* C2+x^2*(C3+x^2*C4).  */
-      sx = C1 + theta2 * sx;     /* C1+x^2*(C2+x^2*(C3+x^2*C4)).  */
-      sx = C0 + theta2 * sx;     /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))).  */
-      sx = 1.0 + theta2 * sx;
-    }
-
-  /* Add in the signbit and assign the result.  */
-  return sign * sx;
-}
-
-/* Compute the cosine value using Chebyshev polynomials where
-   THETA is the range reduced absolute value of the input
-   and it is less than Pi/4,
-   N is calculated as trunc(|x|/(Pi/4)) + 1 and it is used to decide
-   whether a sine or cosine approximation is more accurate and
-   the sign of the result.  */
-static inline float
-reduced_cos (double theta, unsigned int n)
-{
-  double sign, cx;
-  const double theta2 = theta * theta;
-
-  /* Determine positive or negative primary interval.  */
-  n += 2;
-  sign = ones[(n >> 2) & 1];
-
-  /* Are we in the primary interval of sin or cos?  */
-  if ((n & 2) == 0)
-    {
-      /* Here cosf() is calculated using sin Chebyshev polynomial:
-	x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
-      cx = S3 + theta2 * S4;
-      cx = S2 + theta2 * cx;
-      cx = S1 + theta2 * cx;
-      cx = S0 + theta2 * cx;
-      cx = theta + theta * theta2 * cx;
-    }
-  else
-    {
-     /* Here cosf() is calculated using cos Chebyshev polynomial:
-	1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
-      cx = C3 + theta2 * C4;
-      cx = C2 + theta2 * cx;
-      cx = C1 + theta2 * cx;
-      cx = C0 + theta2 * cx;
-      cx = 1. + theta2 * cx;
-    }
-  return sign * cx;
-}
-
-
 /* 2PI * 2^-64.  */
 static const double pi63 = 0x1.921FB54442D18p-62;
 /* PI / 4.  */
@@ -217,6 +78,36 @@ sincosf_poly (double x, double x2, const sincos_t *p, int n, float *sinp,
   *cosp = c + x6 * c2;
 }
 
+/* Return the sine of inputs X and X2 (X squared) using the polynomial P.
+   N is the quadrant, and if odd the cosine polynomial is used.  */
+static inline float
+sinf_poly (double x, double x2, const sincos_t *p, int n)
+{
+  double x3, x4, x6, x7, s, c, c1, c2, s1;
+
+  if ((n & 1) == 0)
+    {
+      x3 = x * x2;
+      s1 = p->s2 + x2 * p->s3;
+
+      x7 = x3 * x2;
+      s = x + x3 * p->s1;
+
+      return s + x7 * s1;
+    }
+  else
+    {
+      x4 = x2 * x2;
+      c2 = p->c3 + x2 * p->c4;
+      c1 = p->c0 + x2 * p->c1;
+
+      x6 = x4 * x2;
+      c = c1 + x4 * p->c2;
+
+      return c + x6 * c2;
+    }
+}
+
 /* Fast range reduction using single multiply-subtract.  Return the modulo of
    X as a value between -PI/4 and PI/4 and store the quadrant in NP.
    The values for PI/2 and 2/PI are accessed via P.  Since PI/2 as a double
author	Wilco Dijkstra <wdijkstr@arm.com>	2018-08-14 10:45:59 +0100
committer	Wilco Dijkstra <wdijkstr@arm.com>	2018-08-14 10:45:59 +0100
commit	599cf3976679e1b345307d9c02057f02aa95528f (patch)
tree	ede9ff73c0eb51bce1a9c540b6daf5c0675b5afd /sysdeps/ieee754/flt-32/s_sincosf.h
parent	e95c6f61920a0f9237cfb292fa44ad500e1df09b (diff)
download	glibc-599cf3976679e1b345307d9c02057f02aa95528f.tar.gz glibc-599cf3976679e1b345307d9c02057f02aa95528f.tar.xz glibc-599cf3976679e1b345307d9c02057f02aa95528f.zip