| field | value | date |
|---|---|---|
| author | Shawn Landden <shawn@git.icu> | 2019-07-04 17:56:46 -0400 |
| committer | Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com> | 2020-02-19 17:29:23 -0300 |
| commit | 4068846e1e364b2441b93cf3d731a6599596726a (patch) | |
| tree | 2b92d418d2f8db7a81a63029c424e1deffed4fb6 | |
| parent | d91313ed6fb18acfb4f7b0399fd18e87f5c00221 (diff) | |
| download | glibc-4068846e1e364b2441b93cf3d731a6599596726a.tar.gz, glibc-4068846e1e364b2441b93cf3d731a6599596726a.tar.xz, glibc-4068846e1e364b2441b93cf3d731a6599596726a.zip | |
PPC64: Add libmvec SIMD single-precision power function [BZ #24210]
Based on the ./sysdeps/ieee754/flt-32/powf.c implementation, and thus provides identical results.

Unlike other libmvec functions, this sets the underflow and overflow bits. The caller can check these flags, and possibly re-run the calculations with scalar powf to figure out what is causing the overflow or underflow.

I may not have normalized the data for benchmarking this properly, but operating only on floats between 0.5 and 1 I get the following:

Running 20 times over 32MiB:
vector: mean 307.659767 (sd 0.203217)
scalar: mean 221.837088 (sd 0.032256)

And with random data there is a decrease in performance:
vector: mean 265.366371 (sd 0.000626)
scalar: mean 279.598078 (sd 0.025592)

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
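The fallback pattern the commit message describes could look like the following. This is a minimal sketch, not part of the commit: `powf4_checked` is a hypothetical helper, and it assumes GCC vector extensions compiled with `-maltivec -mvsx` and linked against libmvec (`-lmvec`).

```c
#include <altivec.h>
#include <fenv.h>
#include <math.h>

/* The vector entry point added by this commit.  */
extern vector float _ZGVbN4vv_powf (vector float, vector float);

/* Hypothetical helper: run the vector powf, and if the overflow or
   underflow flag was raised, recompute each lane with scalar powf so
   the offending element can be identified.  */
vector float
powf4_checked (vector float x, vector float y)
{
  feclearexcept (FE_OVERFLOW | FE_UNDERFLOW);
  vector float r = _ZGVbN4vv_powf (x, y);
  if (fetestexcept (FE_OVERFLOW | FE_UNDERFLOW))
    /* Scalar rerun raises the flags per lane.  */
    for (int i = 0; i < 4; i++)
      r[i] = powf (x[i], y[i]);
  return r;
}
```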
-rw-r--r-- | NEWS | 1
-rw-r--r-- | sysdeps/powerpc/bits/math-vector.h | 2
-rw-r--r-- | sysdeps/powerpc/powerpc64/fpu/Versions | 2
-rw-r--r-- | sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile | 4
-rw-r--r-- | sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c | 1
-rw-r--r-- | sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c | 12
-rw-r--r-- | sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_powf4_vsx.c | 336
-rw-r--r-- | sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist | 1
8 files changed, 357 insertions, 2 deletions
```diff
diff --git a/NEWS b/NEWS
index 2d1002a489..81f12f6038 100644
--- a/NEWS
+++ b/NEWS
@@ -270,6 +270,7 @@ Major new features:
   - single-precision sincos: sincosf
   - double-precision exp: exp
   - single-precision exp: expf
+  - single-precision pow: powf
 
   GCC support for auto-vectorization of functions on PPC64 is not yet
   available.  Until that is done, the new vector math functions are
diff --git a/sysdeps/powerpc/bits/math-vector.h b/sysdeps/powerpc/bits/math-vector.h
index e7846b4bfa..5709efbae0 100644
--- a/sysdeps/powerpc/bits/math-vector.h
+++ b/sysdeps/powerpc/bits/math-vector.h
@@ -48,6 +48,8 @@
 #  define __DECL_SIMD_expf __DECL_SIMD_PPC64
 #  undef __DECL_SIMD_exp
 #  define __DECL_SIMD_exp __DECL_SIMD_PPC64
+#  undef __DECL_SIMD_powf
+#  define __DECL_SIMD_powf __DECL_SIMD_PPC64
 
 # endif
 #endif
diff --git a/sysdeps/powerpc/powerpc64/fpu/Versions b/sysdeps/powerpc/powerpc64/fpu/Versions
index 361edee9e7..225f7c8475 100644
--- a/sysdeps/powerpc/powerpc64/fpu/Versions
+++ b/sysdeps/powerpc/powerpc64/fpu/Versions
@@ -2,6 +2,6 @@ libmvec {
   GLIBC_2.30 {
     _ZGVbN2v_cos; _ZGVbN4v_cosf; _ZGVbN2v_sin; _ZGVbN4v_sinf;
     _ZGVbN2v_log; _ZGVbN4v_logf; _ZGVbN2vvv_sincos; _ZGVbN4vvv_sincosf;
-    _ZGVbN4v_expf; _ZGVbN2v_exp;
+    _ZGVbN4v_expf; _ZGVbN2v_exp; _ZGVbN4vv_powf;
   }
 }
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
index 9275836a26..6d02e725bc 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
@@ -4,6 +4,7 @@ libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
 			   vec_d_log2_vsx vec_d_log_data \
 			   vec_s_logf4_vsx vec_s_logf_data \
 			   vec_s_expf4_vsx vec_s_exp2f_data \
+			   vec_s_powf4_vsx e_powf_log2_data \
 			   vec_math_errf vec_math_err \
 			   vec_d_exp2_vsx vec_d_exp_data \
 			   vec_d_sincos2_vsx vec_s_sincosf4_vsx
@@ -22,6 +23,7 @@ CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_d_exp2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_d_exp_data.c += -mabi=altivec -maltivec -mvsx
+CFLAGS-vec_s_powf4_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_math_err.c += -mabi=altivec -maltivec -mvsx
 endif
 
@@ -31,7 +33,7 @@ ifeq ($(build-mathvec),yes)
 libmvec-tests += double-vlen2 float-vlen4
 
 double-vlen2-funcs = cos sin sincos log exp
-float-vlen4-funcs = cos sin sincos log exp
+float-vlen4-funcs = cos sin sincos log exp pow
 
 double-vlen2-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
 float-vlen4-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
index 37886d1a45..d98c11651e 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
@@ -25,4 +25,5 @@ VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf)
 VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
 VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
+VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
 VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf)
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
index 1d145ffde1..1f5042263d 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
@@ -36,3 +36,15 @@ __math_oflowf (uint32_t sign)
 {
   return xflowf (sign, 0x1p97f);
 }
+
+attribute_hidden float
+__math_invalidf (float x)
+{
+  return (x - x) / (x - x);
+}
+
+attribute_hidden float
+__math_divzerof (uint32_t sign)
+{
+  return (float)(sign ? -1.0f : 1.0f) / 0.0f;
+}
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_powf4_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_powf4_vsx.c
new file mode 100644
index 0000000000..f975784933
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_powf4_vsx.c
@@ -0,0 +1,336 @@
+/* Single-precision vector pow function.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+#include <math-barriers.h>
+#include <stdint.h>
+#include "math_config_flt.h"
+
+typedef vector long long unsigned v64u;
+
+/*
+POWF_LOG2_POLY_ORDER = 5
+EXP2F_TABLE_BITS = 5
+
+ULP error: 0.82 (~ 0.5 + relerr*2^24)
+relerr: 1.27 * 2^-26 (Relative error ~= 128*Ln2*relerr_log2 + relerr_exp2)
+relerr_log2: 1.83 * 2^-33 (Relative error of logx.)
+relerr_exp2: 1.69 * 2^-34 (Relative error of exp2(ylogx).)
+*/
+
+#define N (1 << POWF_LOG2_TABLE_BITS)
+#define T __powf_log2_data.tab
+#define A __powf_log2_data.poly
+#define OFF 0x3f330000
+
+
+/* Subnormal input is normalized so ix has negative biased exponent.
+   Output is multiplied by N (POWF_SCALE) if TOINT_INTRINICS is set.  */
+
+struct two_v_doubles {
+  vector double l;
+  vector double r;
+};
+
+static inline struct two_v_doubles
+log2_inline (vector unsigned ix)
+{
+  vector float z;
+  vector double rl, rr, r2l, r2r, r4l, r4r, pl, pr, ql, qr, yl, yr, y0l, y0r;
+  vector unsigned iz, top, tmp;
+  vector signed k, i;
+  struct two_v_doubles ret;
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF] and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = ix - OFF;
+  i = (vector signed)(tmp >> (23 - POWF_LOG2_TABLE_BITS)) % N;
+  top = tmp & 0xff800000;
+  iz = ix - top;
+  k = (vector signed)top >> (23 - POWF_SCALE_BITS); /* arithmetic shift */
+  vector double invcl = {T[i[0]].invc, T[i[1]].invc};
+  vector double invcr = {T[i[2]].invc, T[i[3]].invc};
+  vector double logcl = {T[i[0]].logc, T[i[1]].logc};
+  vector double logcr = {T[i[2]].logc, T[i[3]].logc};
+  z = vasfloat(iz);
+  vector double zl = {z[0], z[1]};
+  vector double zr = {z[2], z[3]};
+
+  /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
+  rl = zl * invcl - 1;
+  rr = zr * invcr - 1;
+  vector double kl = {(double)k[0], (double)k[1]};
+  vector double kr = {(double)k[2], (double)k[3]};
+  y0l = logcl + kl;
+  y0r = logcr + kr;
+
+  /* Pipelined polynomial evaluation to approximate log1p(r)/ln2.  */
+  r2l = rl * rl;
+  r2r = rr * rr;
+  yl = A[0] * rl + A[1];
+  yr = A[0] * rr + A[1];
+  pl = A[2] * rl + A[3];
+  pr = A[2] * rr + A[3];
+  r4l = r2l * r2l;
+  r4r = r2r * r2r;
+  ql = A[4] * rl + y0l;
+  qr = A[4] * rr + y0r;
+  ql = pl * r2l + ql;
+  qr = pr * r2r + qr;
+  yl = yl * r4l + ql;
+  yr = yr * r4r + qr;
+  ret.l = yl;
+  ret.r = yr;
+  return ret;
+}
+
+#undef N
+#undef T
+#define N (1 << EXP2F_TABLE_BITS)
+#define T __exp2f_data.tab
+#define SIGN_BIAS (1 << (EXP2F_TABLE_BITS + 11))
+
+/* The output of log2 and thus the input of exp2 is either scaled by N
+   (in case of fast toint intrinsics) or not.  The unscaled xd must be
+   in [-1021,1023], sign_bias sets the sign of the result.  */
+static inline vector float
+exp2_inline (vector double xdl, vector double xdr, vector unsigned sign_bias)
+{
+  v64u kil, kir, skil, skir, sign_biasl, sign_biasr;
+  vector double kdl, kdr, zl, zr, rl, rr, r2l, r2r, yl, yr, sl, sr;
+
+  vector unsigned zero = {0, 0, 0, 0};
+#ifdef __LITTLE_ENDIAN__
+  sign_biasl = (v64u) vec_mergeh (sign_bias, zero);
+  sign_biasr = (v64u) vec_mergel (sign_bias, zero);
+#else
+  sign_biasl = (v64u) vec_mergel (zero, sign_bias);
+  sign_biasr = (v64u) vec_mergeh (zero, sign_bias);
+#endif
+#define C __exp2f_data.poly
+#define SHIFT __exp2f_data.shift_scaled
+  /* x = k/N + r with r in [-1/(2N), 1/(2N)] */
+  kdl = xdl + SHIFT;
+  kdr = xdr + SHIFT;
+  kil = vasuint64 (kdl);
+  kir = vasuint64 (kdr);
+  kdl -= SHIFT;
+  kdr -= SHIFT; /* k/N */
+  rl = xdl - kdl;
+  rr = xdr - kdr;
+
+  /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
+  v64u tl = {T[kil[0] % N], T[kil[1] % N]};
+  v64u tr = {T[kir[0] % N], T[kir[1] % N]};
+  skil = kil + sign_biasl;
+  skir = kir + sign_biasr;
+  tl += skil << (52 - EXP2F_TABLE_BITS);
+  tr += skir << (52 - EXP2F_TABLE_BITS);
+  sl = vasdouble(tl);
+  sr = vasdouble(tr);
+  zl = C[0] * rl + C[1];
+  zr = C[0] * rr + C[1];
+  r2l = rl * rl;
+  r2r = rr * rr;
+  yl = C[2] * rl + 1;
+  yr = C[2] * rr + 1;
+  yl = zl * r2l + yl;
+  yr = zr * r2r + yr;
+  yl = yl * sl;
+  yr = yr * sr;
+  /* There is no vector pack/unpack for 64<->32 */
+  vector float res = {(float)yl[0], (float)yl[1], (float)yr[0], (float)yr[1]};
+  return res;
+}
+
+/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
+   the bit representation of a non-zero finite floating-point value.  */
+static inline vector unsigned
+checkint (vector unsigned iy)
+{
+  vector unsigned e = iy >> 23 & 0xff;
+  vector unsigned zero = {0, 0, 0, 0};
+  vector unsigned not_matched = ~zero;
+  vector unsigned res;
+  vector unsigned is_first = (vector unsigned) vec_cmplt (e, zero + 0x7f);
+  not_matched &= ~is_first;
+  res = zero;
+  vector unsigned is_second = (vector unsigned) vec_cmpgt (e, zero + 0x7f + 23);
+  not_matched &= ~is_second;
+  res = vec_sel (res, zero + 2, is_second);
+  vector unsigned is_third =
+    (vector unsigned) vec_cmpne (iy & ((1 << (0x7f + 23 - e)) - 1), zero);
+  res = vec_sel (res, zero, is_third & not_matched);
+  not_matched &= ~is_third;
+  vector unsigned is_four =
+    (vector unsigned) vec_cmpne (iy & (1 << (0x7f + 23 - e)), zero);
+  res = vec_sel (res, zero + 1, is_four & not_matched);
+  not_matched &= ~is_four;
+  res = vec_sel (res, zero + 2, not_matched);
+  return res;
+}
+
+static vector unsigned
+zeroinfnan (vector unsigned ix)
+{
+  vector unsigned zero = {0, 0, 0, 0};
+  return (vector unsigned) vec_cmpge (2 * ix - 1,
+				      zero + (2u * 0x7f800000 - 1));
+}
+
+vector float
+_ZGVbN4vv_powf (vector float x, vector float y)
+{
+  vector unsigned zero = {0, 0, 0, 0};
+  vector unsigned sign_bias = zero;
+  vector unsigned ix, iy, res = zero, res_mask = zero;
+
+  ix = vasuint (x);
+  iy = vasuint (y);
+  vector unsigned special_cst = {0x7f800000 - 0x00800000,
+				 0x7f800000 - 0x00800000,
+				 0x7f800000 - 0x00800000,
+				 0x7f800000 - 0x00800000};
+  vector unsigned is_special = (vector unsigned) vec_cmpge (ix - 0x00800000,
+							    special_cst);
+  vector unsigned is_zeroinfnanx = zeroinfnan(iy);
+  if (__glibc_unlikely (!vec_all_eq (is_special | is_zeroinfnanx, zero)))
+    {
+      if (!vec_all_eq(is_zeroinfnanx, zero))
+	{
+	  vector unsigned not_covered = is_zeroinfnanx;
+	  res_mask = is_zeroinfnanx;
+	  vector unsigned is_one = (vector unsigned) vec_cmpeq (2 * iy, zero);
+	  vector float one = {1.0f, 1.0f, 1.0f, 1.0f};
+	  vector unsigned is_two =
+	    (vector unsigned) vec_cmpeq (ix, zero + 0x3f800000);
+	  res = vec_sel (res, vasuint (one), (is_one | is_two) & not_covered);
+	  not_covered &= ~(is_one | is_two);
+	  vector unsigned is_threea =
+	    (vector unsigned) vec_cmpgt (2 * ix, zero + 2u * 0x7f800000);
+	  vector unsigned is_threeb =
+	    (vector unsigned) vec_cmpgt (2 * iy, zero + 2u * 0x7f800000);
+	  vector float xy = x + y;
+	  res = vec_sel (res, vasuint (xy), (is_threea | is_threeb)
+					    & not_covered);
+	  not_covered &= ~(is_threea | is_threeb);
+	  vector unsigned is_four =
+	    (vector unsigned) vec_cmpeq (2 * ix, zero + 2 * 0x3f800000);
+	  res = vec_sel (res, vasuint (one), is_four & not_covered);
+	  not_covered &= ~is_four;
+	  vector unsigned is_fivea =
+	    (vector unsigned) vec_cmplt (2 * ix, zero + 2 * 0x3f800000);
+	  vector unsigned is_fiveb =
+	    (vector unsigned) vec_cmplt (iy, zero + 0x80000000);
+	  vector unsigned is_five =
+	    (vector unsigned) vec_cmpeq (is_fivea, is_fiveb);
+	  res = vec_sel (res, zero, is_five & not_covered);
+	  not_covered &= ~is_five;
+	  vector float yy = y * y;
+	  res = vec_sel (res, vasuint (yy), not_covered);
+	}
+      vector unsigned is_ix =
+	(vector unsigned) vec_cmpge (ix, zero + 0x80000000);
+      vector unsigned is_xinfnan = zeroinfnan(ix);
+      if (!vec_all_eq (is_xinfnan & ~res_mask, zero))
+	{
+	  vector float x2 = x * x;
+	  vector unsigned is_checkinty =
+	    (vector unsigned) vec_cmpeq (checkint(iy), zero + 1);
+	  if (!vec_all_eq (is_ix & is_checkinty, zero))
+	    x2 = vec_sel (x2, -x2, is_ix & is_checkinty);
+	  vector unsigned is_iy =
+	    (vector unsigned) vec_cmpge (iy, zero + 0x80000000);
+	  if (!vec_all_eq (is_iy, zero))
+	    {
+	      math_force_eval (1 / x2);
+	      x2 = vec_sel (x2, 1 / x2, is_iy);
+	    }
+	  res = vec_sel (res, vasuint(x2), is_xinfnan);
+	  res_mask |= is_xinfnan;
+	}
+      vector unsigned is_xneg =
+	(vector unsigned) vec_cmpgt (ix, zero + 0x80000000);
+      if (!vec_all_eq (is_xneg, zero))
+	{
+	  vector unsigned yint = checkint (iy);
+	  vector unsigned is_invalid = (vector unsigned) vec_cmpeq (yint, zero);
+	  if (!vec_all_eq(is_invalid & ~res_mask, zero))
+	    {
+	      for (int m = 0; m < 4; m++)
+		{
+		  if ((is_invalid & ~res_mask)[m] == 0)
+		    continue;
+		  res[m] = asuint (__math_invalidf (x[m]));
+		  res_mask[m] = 0xffffffff;
+		}
+	    }
+	  vector unsigned is_one = (vector unsigned) vec_cmpeq (yint, zero + 1);
+	  sign_bias = vec_sel (sign_bias, zero + SIGN_BIAS, is_one);
+	  ix = vec_sel (ix, ix & 0x7fffffff, is_xneg);
+	}
+      vector unsigned is_subnormal =
+	(vector unsigned) vec_cmplt (ix, zero + 0x00800000);
+      if (!vec_all_eq (is_subnormal & ~res_mask, zero))
+	{
+	  vector unsigned subnormals = ix;
+	  subnormals = vasuint (vasfloat(ix) * 0x1p23f);
+	  subnormals = subnormals & (unsigned)0x7ffffffff;
+	  subnormals -= 23 << 23;
+	  ix = vec_sel (ix, subnormals, is_subnormal & ~res_mask);
+	}
+      /* If we already filled in all the results, we can return early.  */
+      if (vec_all_eq (res_mask, ~zero))
+	return vasfloat (res);
+    }
+  struct two_v_doubles logx = log2_inline (ix);
+
+  vector double yl = {y[0], y[1]};
+  vector double yr = {y[2], y[3]};
+  vector double ylogxl = yl * logx.l;
+  vector double ylogxr = yr * logx.r;
+  v64u overunderflow_const = {asuint64(126.0 * POWF_SCALE) >> 47,
+			      asuint64(126.0 * POWF_SCALE) >> 47};
+  v64u is_overunderflowl = (v64u) vec_cmpge
+    (((vasuint64 (ylogxl) >> 47) & 0xffff), overunderflow_const);
+  v64u is_overunderflowr = (v64u) vec_cmpge
+    (((vasuint64(ylogxr) >> 47) & 0xffff), overunderflow_const);
+  vector unsigned is_overunderflow = vec_pack (is_overunderflowl,
+					       is_overunderflowr);
+  if (__glibc_unlikely (!vec_all_eq (is_overunderflow, zero)))
+    {
+      vector double ylogx = ylogxl;
+      for (int m = 0; m < 4; m++)
+	{
+	  if (is_overunderflow[m] == 0)
+	    continue;
+	  if (m == 2 || m == 3)
+	    ylogx = ylogxr;
+	  if (ylogx[m % 2] > (0x1.fffffffd1d571p+6 * POWF_SCALE))
+	    {
+	      res[m] = asuint (__math_oflowf (sign_bias[m]));
+	      res_mask[m] = 0xffffffff;
+	    }
+	  else if (ylogx[m % 2] <= (-150.0 * POWF_SCALE))
+	    {
+	      res[m] = asuint (__math_uflowf (sign_bias[m]));
+	      res_mask[m] = 0xffffffff;
+	    }
+	}
+    }
+  vector unsigned exp2 = vasuint (exp2_inline (ylogxl, ylogxr, sign_bias));
+  return vasfloat (vec_sel (exp2, res, res_mask));
+}
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
index 46558dbd77..eedc6c84de 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
@@ -7,4 +7,5 @@ GLIBC_2.30 _ZGVbN4v_cosf F
 GLIBC_2.30 _ZGVbN4v_expf F
 GLIBC_2.30 _ZGVbN4v_logf F
 GLIBC_2.30 _ZGVbN4v_sinf F
+GLIBC_2.30 _ZGVbN4vv_powf F
 GLIBC_2.30 _ZGVbN4vvv_sincosf F
```
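As the NEWS entry notes, GCC cannot yet auto-vectorize powf calls into this entry point on PPC64, so the new symbol has to be invoked directly. A minimal usage sketch (my example, not from the commit), compiled with `-maltivec -mvsx` and linked with `-lmvec` on a glibc 2.30+ powerpc64 system:

```c
#include <altivec.h>
#include <stdio.h>

/* Vector variant of powf added by this commit (b = VSX, N4 = four
   lanes, vv = two vector arguments).  */
extern vector float _ZGVbN4vv_powf (vector float, vector float);

int
main (void)
{
  vector float x = { 2.0f, 3.0f, 4.0f, 0.5f };
  vector float y = { 3.0f, 2.0f, 0.5f, 2.0f };
  vector float r = _ZGVbN4vv_powf (x, y);

  /* Expect 8, 9, 2, 0.25 -- identical to scalar powf, since the
     implementation is based on sysdeps/ieee754/flt-32/powf.c.  */
  for (int i = 0; i < 4; i++)
    printf ("powf (%g, %g) = %g\n", (double) x[i], (double) y[i],
	    (double) r[i]);
  return 0;
}
```

Once GCC gains libmvec auto-vectorization support for PPC64, a plain `powf` loop under `#pragma omp simd` should map to this same symbol via the `__DECL_SIMD_powf` declaration added in bits/math-vector.h, as already happens on x86_64.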