Diffstat (limited to 'sysdeps/aarch64/fpu')
-rw-r--r-- | sysdeps/aarch64/fpu/advsimd_utils.h  |  39
-rw-r--r-- | sysdeps/aarch64/fpu/cos_advsimd.c    |  82
-rw-r--r-- | sysdeps/aarch64/fpu/cos_sve.c        |  74
-rw-r--r-- | sysdeps/aarch64/fpu/cosf_advsimd.c   |  77
-rw-r--r-- | sysdeps/aarch64/fpu/cosf_sve.c       |  72
-rw-r--r-- | sysdeps/aarch64/fpu/sv_math.h        | 141
-rw-r--r-- | sysdeps/aarch64/fpu/sve_utils.h      |  55
-rw-r--r-- | sysdeps/aarch64/fpu/v_math.h         | 145
-rw-r--r-- | sysdeps/aarch64/fpu/vecmath_config.h |  38
9 files changed, 607 insertions, 116 deletions
diff --git a/sysdeps/aarch64/fpu/advsimd_utils.h b/sysdeps/aarch64/fpu/advsimd_utils.h
deleted file mode 100644
index 8a0fcc0e06..0000000000
--- a/sysdeps/aarch64/fpu/advsimd_utils.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Helpers for Advanced SIMD vector math functions.
-
-   Copyright (C) 2023 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>. */
-
-#include <arm_neon.h>
-
-#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
-
-#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
-#define V_NAME_D1(fun) _ZGVnN2v_##fun
-#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
-#define V_NAME_D2(fun) _ZGVnN2vv_##fun
-
-static __always_inline float32x4_t
-v_call_f32 (float (*f) (float), float32x4_t x)
-{
-  return (float32x4_t){ f (x[0]), f (x[1]), f (x[2]), f (x[3]) };
-}
-
-static __always_inline float64x2_t
-v_call_f64 (double (*f) (double), float64x2_t x)
-{
-  return (float64x2_t){ f (x[0]), f (x[1]) };
-}
diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
index 40831e6b0d..e8f1d10540 100644
--- a/sysdeps/aarch64/fpu/cos_advsimd.c
+++ b/sysdeps/aarch64/fpu/cos_advsimd.c
@@ -17,13 +17,83 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>. */
 
-#include <math.h>
+#include "v_math.h"
 
-#include "advsimd_utils.h"
+static const struct data
+{
+  float64x2_t poly[7];
+  float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+} data = {
+  /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
+  .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+            V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+            V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+            V2 (-0x1.9e9540300a1p-41) },
+  .inv_pi = V2 (0x1.45f306dc9c883p-2),
+  .half_pi = V2 (0x1.921fb54442d18p+0),
+  .pi_1 = V2 (0x1.921fb54442d18p+1),
+  .pi_2 = V2 (0x1.1a62633145c06p-53),
+  .pi_3 = V2 (0x1.c1cd129024e09p-106),
+  .shift = V2 (0x1.8p52),
+  .range_val = V2 (0x1p23)
+};
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+  y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+  return v_call_f64 (cos, x, y, cmp);
+}
 
-VPCS_ATTR
-float64x2_t
-V_NAME_D1 (cos) (float64x2_t x)
+float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
 {
-  return v_call_f64 (cos, x);
+  const struct data *d = ptr_barrier (&data);
+  float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
+  uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+  r = vabsq_f64 (x);
+  cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
+                   vreinterpretq_u64_f64 (d->range_val));
+  if (__glibc_unlikely (v_any_u64 (cmp)))
+    /* If fenv exceptions are to be triggered correctly, set any special lanes
+       to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+       special-case handler later. */
+    r = vbslq_f64 (cmp, v_f64 (1.0), r);
+#else
+  cmp = vcageq_f64 (d->range_val, x);
+  cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
+  r = x;
+#endif
+
+  /* n = rint((|x|+pi/2)/pi) - 0.5. */
+  n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
+  odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
+  n = vsubq_f64 (n, d->shift);
+  n = vsubq_f64 (n, v_f64 (0.5));
+
+  /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+  r = vfmsq_f64 (r, d->pi_1, n);
+  r = vfmsq_f64 (r, d->pi_2, n);
+  r = vfmsq_f64 (r, d->pi_3, n);
+
+  /* sin(r) poly approx. */
+  r2 = vmulq_f64 (r, r);
+  r3 = vmulq_f64 (r2, r);
+  r4 = vmulq_f64 (r2, r2);
+
+  t1 = vfmaq_f64 (C (4), C (5), r2);
+  t2 = vfmaq_f64 (C (2), C (3), r2);
+  t3 = vfmaq_f64 (C (0), C (1), r2);
+
+  y = vfmaq_f64 (t1, C (6), r4);
+  y = vfmaq_f64 (t2, y, r4);
+  y = vfmaq_f64 (t3, y, r4);
+  y = vfmaq_f64 (r, y, r3);
+
+  if (__glibc_unlikely (v_any_u64 (cmp)))
+    return special_case (x, y, odd, cmp);
+  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
 }
diff --git a/sysdeps/aarch64/fpu/cos_sve.c b/sysdeps/aarch64/fpu/cos_sve.c
index 55501e5000..d7a9b134da 100644
--- a/sysdeps/aarch64/fpu/cos_sve.c
+++ b/sysdeps/aarch64/fpu/cos_sve.c
@@ -17,12 +17,76 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>. */
 
-#include <math.h>
+#include "sv_math.h"
 
-#include "sve_utils.h"
+static const struct data
+{
+  double inv_pio2, pio2_1, pio2_2, pio2_3, shift;
+} data = {
+  /* Polynomial coefficients are hardwired in FTMAD instructions. */
+  .inv_pio2 = 0x1.45f306dc9c882p-1,
+  .pio2_1 = 0x1.921fb50000000p+0,
+  .pio2_2 = 0x1.110b460000000p-26,
+  .pio2_3 = 0x1.1a62633145c07p-54,
+  /* Original shift used in AdvSIMD cos,
+     plus a contribution to set the bit #0 of q
+     as expected by trigonometric instructions. */
+  .shift = 0x1.8000000000001p52
+};
+
+#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t out_of_bounds)
+{
+  return sv_call_f64 (cos, x, y, out_of_bounds);
+}
 
-svfloat64_t
-SV_NAME_D1 (cos) (svfloat64_t x, svbool_t pg)
+/* A fast SVE implementation of cos based on trigonometric
+   instructions (FTMAD, FTSSEL, FTSMUL).
+   Maximum measured error: 2.108 ULPs.
+   SV_NAME_D1 (cos)(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3
+                                          want -0x1.fddd4c65c7f05p-3. */
+svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg)
 {
-  return sv_call_f64 (cos, x, svdup_n_f64 (0), pg);
+  const struct data *d = ptr_barrier (&data);
+
+  svfloat64_t r = svabs_f64_x (pg, x);
+  svbool_t out_of_bounds
+    = svcmpge_n_u64 (pg, svreinterpret_u64_f64 (r), RangeVal);
+
+  /* Load some constants in quad-word chunks to minimise memory access. */
+  svbool_t ptrue = svptrue_b64 ();
+  svfloat64_t invpio2_and_pio2_1 = svld1rq_f64 (ptrue, &d->inv_pio2);
+  svfloat64_t pio2_23 = svld1rq_f64 (ptrue, &d->pio2_2);
+
+  /* n = rint(|x|/(pi/2)). */
+  svfloat64_t q = svmla_lane_f64 (sv_f64 (d->shift), r, invpio2_and_pio2_1, 0);
+  svfloat64_t n = svsub_n_f64_x (pg, q, d->shift);
+
+  /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */
+  r = svmls_lane_f64 (r, n, invpio2_and_pio2_1, 1);
+  r = svmls_lane_f64 (r, n, pio2_23, 0);
+  r = svmls_lane_f64 (r, n, pio2_23, 1);
+
+  /* cos(r) poly approx. */
+  svfloat64_t r2 = svtsmul_f64 (r, svreinterpret_u64_f64 (q));
+  svfloat64_t y = sv_f64 (0.0);
+  y = svtmad_f64 (y, r2, 7);
+  y = svtmad_f64 (y, r2, 6);
+  y = svtmad_f64 (y, r2, 5);
+  y = svtmad_f64 (y, r2, 4);
+  y = svtmad_f64 (y, r2, 3);
+  y = svtmad_f64 (y, r2, 2);
+  y = svtmad_f64 (y, r2, 1);
+  y = svtmad_f64 (y, r2, 0);
+
+  /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
+  svfloat64_t f = svtssel_f64 (r, svreinterpret_u64_f64 (q));
+  /* Apply factor. */
+  y = svmul_f64_x (pg, f, y);
+
+  if (__glibc_unlikely (svptest_any (pg, out_of_bounds)))
+    return special_case (x, y, out_of_bounds);
+  return y;
 }
diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
index 35bb81aead..f05dd2bcda 100644
--- a/sysdeps/aarch64/fpu/cosf_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
@@ -17,13 +17,78 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>. */
 
-#include <math.h>
+#include "v_math.h"
 
-#include "advsimd_utils.h"
+static const struct data
+{
+  float32x4_t poly[4];
+  float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+  /* 1.886 ulp error. */
+  .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+            V4 (0x1.5b2e76p-19f) },
+
+  .pi_1 = V4 (0x1.921fb6p+1f),
+  .pi_2 = V4 (-0x1.777a5cp-24f),
+  .pi_3 = V4 (-0x1.ee59dap-49f),
+
+  .inv_pi = V4 (0x1.45f306p-2f),
+  .shift = V4 (0x1.8p+23f),
+  .half_pi = V4 (0x1.921fb6p0f),
+  .range_val = V4 (0x1p20f)
+};
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+  /* Fall back to scalar code. */
+  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+  return v_call_f32 (cosf, x, y, cmp);
+}
 
-VPCS_ATTR
-float32x4_t
-V_NAME_F1 (cos) (float32x4_t x)
+float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
 {
-  return v_call_f32 (cosf, x);
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t n, r, r2, r3, y;
+  uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+  r = vabsq_f32 (x);
+  cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
+                   vreinterpretq_u32_f32 (d->range_val));
+  if (__glibc_unlikely (v_any_u32 (cmp)))
+    /* If fenv exceptions are to be triggered correctly, set any special lanes
+       to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+       special-case handler later. */
+    r = vbslq_f32 (cmp, v_f32 (1.0f), r);
+#else
+  cmp = vcageq_f32 (d->range_val, x);
+  cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
+  r = x;
+#endif
+
+  /* n = rint((|x|+pi/2)/pi) - 0.5. */
+  n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
+  odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+  n = vsubq_f32 (n, d->shift);
+  n = vsubq_f32 (n, v_f32 (0.5f));
+
+  /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+  r = vfmsq_f32 (r, d->pi_1, n);
+  r = vfmsq_f32 (r, d->pi_2, n);
+  r = vfmsq_f32 (r, d->pi_3, n);
+
+  /* y = sin(r). */
+  r2 = vmulq_f32 (r, r);
+  r3 = vmulq_f32 (r2, r);
+  y = vfmaq_f32 (C (2), C (3), r2);
+  y = vfmaq_f32 (C (1), y, r2);
+  y = vfmaq_f32 (C (0), y, r2);
+  y = vfmaq_f32 (r, y, r3);
+
+  if (__glibc_unlikely (v_any_u32 (cmp)))
+    return special_case (x, y, odd, cmp);
+  return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
 }
diff --git a/sysdeps/aarch64/fpu/cosf_sve.c b/sysdeps/aarch64/fpu/cosf_sve.c
index 16c68f387b..577cbd864e 100644
--- a/sysdeps/aarch64/fpu/cosf_sve.c
+++ b/sysdeps/aarch64/fpu/cosf_sve.c
@@ -17,12 +17,74 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>. */
 
-#include <math.h>
+#include "sv_math.h"
 
-#include "sve_utils.h"
+static const struct data
+{
+  float neg_pio2_1, neg_pio2_2, neg_pio2_3, inv_pio2, shift;
+} data = {
+  /* Polynomial coefficients are hard-wired in FTMAD instructions. */
+  .neg_pio2_1 = -0x1.921fb6p+0f,
+  .neg_pio2_2 = 0x1.777a5cp-25f,
+  .neg_pio2_3 = 0x1.ee59dap-50f,
+  .inv_pio2 = 0x1.45f306p-1f,
+  /* Original shift used in AdvSIMD cosf,
+     plus a contribution to set the bit #0 of q
+     as expected by trigonometric instructions. */
+  .shift = 0x1.800002p+23f
+};
+
+#define RangeVal 0x49800000 /* asuint32(0x1p20f). */
 
-svfloat32_t
-SV_NAME_F1 (cos) (svfloat32_t x, svbool_t pg)
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t out_of_bounds)
 {
-  return sv_call_f32 (cosf, x, svdup_n_f32 (0), pg);
+  return sv_call_f32 (cosf, x, y, out_of_bounds);
+}
+
+/* A fast SVE implementation of cosf based on trigonometric
+   instructions (FTMAD, FTSSEL, FTSMUL).
+   Maximum measured error: 2.06 ULPs.
+   SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6
+                                    want 0x1.fffe76p-6. */
+svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svfloat32_t r = svabs_f32_x (pg, x);
+  svbool_t out_of_bounds
+    = svcmpge_n_u32 (pg, svreinterpret_u32_f32 (r), RangeVal);
+
+  /* Load some constants in quad-word chunks to minimise memory access. */
+  svfloat32_t negpio2_and_invpio2
+    = svld1rq_f32 (svptrue_b32 (), &d->neg_pio2_1);
+
+  /* n = rint(|x|/(pi/2)). */
+  svfloat32_t q
+    = svmla_lane_f32 (sv_f32 (d->shift), r, negpio2_and_invpio2, 3);
+  svfloat32_t n = svsub_n_f32_x (pg, q, d->shift);
+
+  /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */
+  r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 0);
+  r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 1);
+  r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 2);
+
+  /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
+  svfloat32_t f = svtssel_f32 (r, svreinterpret_u32_f32 (q));
+
+  /* cos(r) poly approx. */
+  svfloat32_t r2 = svtsmul_f32 (r, svreinterpret_u32_f32 (q));
+  svfloat32_t y = sv_f32 (0.0f);
+  y = svtmad_f32 (y, r2, 4);
+  y = svtmad_f32 (y, r2, 3);
+  y = svtmad_f32 (y, r2, 2);
+  y = svtmad_f32 (y, r2, 1);
+  y = svtmad_f32 (y, r2, 0);
+
+  /* Apply factor. */
+  y = svmul_f32_x (pg, f, y);
+
+  if (__glibc_unlikely (svptest_any (pg, out_of_bounds)))
+    return special_case (x, y, out_of_bounds);
+  return y;
 }
diff --git a/sysdeps/aarch64/fpu/sv_math.h b/sysdeps/aarch64/fpu/sv_math.h
new file mode 100644
index 0000000000..fc8e690e80
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_math.h
@@ -0,0 +1,141 @@
+/* Utilities for SVE libmvec routines.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#ifndef SV_MATH_H
+#define SV_MATH_H
+
+#include <arm_sve.h>
+#include <stdbool.h>
+
+#include "vecmath_config.h"
+
+#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f
+#define SV_NAME_D1(fun) _ZGVsMxv_##fun
+#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f
+#define SV_NAME_D2(fun) _ZGVsMxvv_##fun
+
+/* Double precision. */
+static inline svint64_t
+sv_s64 (int64_t x)
+{
+  return svdup_n_s64 (x);
+}
+
+static inline svuint64_t
+sv_u64 (uint64_t x)
+{
+  return svdup_n_u64 (x);
+}
+
+static inline svfloat64_t
+sv_f64 (double x)
+{
+  return svdup_n_f64 (x);
+}
+
+static inline svfloat64_t
+sv_call_f64 (double (*f) (double), svfloat64_t x, svfloat64_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      double elem = svclastb_n_f64 (p, 0, x);
+      elem = (*f) (elem);
+      svfloat64_t y2 = svdup_n_f64 (elem);
+      y = svsel_f64 (p, y2, y);
+      p = svpnext_b64 (cmp, p);
+    }
+  return y;
+}
+
+static inline svfloat64_t
+sv_call2_f64 (double (*f) (double, double), svfloat64_t x1, svfloat64_t x2,
+              svfloat64_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      double elem1 = svclastb_n_f64 (p, 0, x1);
+      double elem2 = svclastb_n_f64 (p, 0, x2);
+      double ret = (*f) (elem1, elem2);
+      svfloat64_t y2 = svdup_n_f64 (ret);
+      y = svsel_f64 (p, y2, y);
+      p = svpnext_b64 (cmp, p);
+    }
+  return y;
+}
+
+static inline svuint64_t
+sv_mod_n_u64_x (svbool_t pg, svuint64_t x, uint64_t y)
+{
+  svuint64_t q = svdiv_n_u64_x (pg, x, y);
+  return svmls_n_u64_x (pg, x, q, y);
+}
+
+/* Single precision. */
+static inline svint32_t
+sv_s32 (int32_t x)
+{
+  return svdup_n_s32 (x);
+}
+
+static inline svuint32_t
+sv_u32 (uint32_t x)
+{
+  return svdup_n_u32 (x);
+}
+
+static inline svfloat32_t
+sv_f32 (float x)
+{
+  return svdup_n_f32 (x);
+}
+
+static inline svfloat32_t
+sv_call_f32 (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      float elem = svclastb_n_f32 (p, 0, x);
+      elem = f (elem);
+      svfloat32_t y2 = svdup_n_f32 (elem);
+      y = svsel_f32 (p, y2, y);
+      p = svpnext_b32 (cmp, p);
+    }
+  return y;
+}
+
+static inline svfloat32_t
+sv_call2_f32 (float (*f) (float, float), svfloat32_t x1, svfloat32_t x2,
+              svfloat32_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      float elem1 = svclastb_n_f32 (p, 0, x1);
+      float elem2 = svclastb_n_f32 (p, 0, x2);
+      float ret = f (elem1, elem2);
+      svfloat32_t y2 = svdup_n_f32 (ret);
+      y = svsel_f32 (p, y2, y);
+      p = svpnext_b32 (cmp, p);
+    }
+  return y;
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/sve_utils.h b/sysdeps/aarch64/fpu/sve_utils.h
deleted file mode 100644
index 5ce3d2e8d6..0000000000
--- a/sysdeps/aarch64/fpu/sve_utils.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Helpers for SVE vector math functions.
-
-   Copyright (C) 2023 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>. */
-
-#include <arm_sve.h>
-
-#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f
-#define SV_NAME_D1(fun) _ZGVsMxv_##fun
-#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f
-#define SV_NAME_D2(fun) _ZGVsMxvv_##fun
-
-static __always_inline svfloat32_t
-sv_call_f32 (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp)
-{
-  svbool_t p = svpfirst (cmp, svpfalse ());
-  while (svptest_any (cmp, p))
-    {
-      float elem = svclastb_n_f32 (p, 0, x);
-      elem = (*f) (elem);
-      svfloat32_t y2 = svdup_n_f32 (elem);
-      y = svsel_f32 (p, y2, y);
-      p = svpnext_b32 (cmp, p);
-    }
-  return y;
-}
-
-static __always_inline svfloat64_t
-sv_call_f64 (double (*f) (double), svfloat64_t x, svfloat64_t y, svbool_t cmp)
-{
-  svbool_t p = svpfirst (cmp, svpfalse ());
-  while (svptest_any (cmp, p))
-    {
-      double elem = svclastb_n_f64 (p, 0, x);
-      elem = (*f) (elem);
-      svfloat64_t y2 = svdup_n_f64 (elem);
-      y = svsel_f64 (p, y2, y);
-      p = svpnext_b64 (cmp, p);
-    }
-  return y;
-}
diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h
new file mode 100644
index 0000000000..43efd8f99d
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_math.h
@@ -0,0 +1,145 @@
+/* Utilities for Advanced SIMD libmvec routines.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#include <arm_neon.h>
+#include "vecmath_config.h"
+
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+
+/* Shorthand helpers for declaring constants. */
+#define V2(x)                                                                 \
+  {                                                                           \
+    x, x                                                                      \
+  }
+
+#define V4(x)                                                                 \
+  {                                                                           \
+    x, x, x, x                                                                \
+  }
+
+static inline float32x4_t
+v_f32 (float x)
+{
+  return (float32x4_t) V4 (x);
+}
+static inline uint32x4_t
+v_u32 (uint32_t x)
+{
+  return (uint32x4_t) V4 (x);
+}
+static inline int32x4_t
+v_s32 (int32_t x)
+{
+  return (int32x4_t) V4 (x);
+}
+
+/* true if any elements of a vector compare result is non-zero. */
+static inline int
+v_any_u32 (uint32x4_t x)
+{
+  /* assume elements in x are either 0 or -1u. */
+  return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
+static inline float32x4_t
+v_lookup_f32 (const float *tab, uint32x4_t idx)
+{
+  return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+}
+static inline uint32x4_t
+v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
+{
+  return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+}
+static inline float32x4_t
+v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
+{
+  return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+                        p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] };
+}
+static inline float32x4_t
+v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
+             float32x4_t y, uint32x4_t p)
+{
+  return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0],
+                        p[1] ? f (x1[1], x2[1]) : y[1],
+                        p[2] ? f (x1[2], x2[2]) : y[2],
+                        p[3] ? f (x1[3], x2[3]) : y[3] };
+}
+
+static inline float64x2_t
+v_f64 (double x)
+{
+  return (float64x2_t) V2 (x);
+}
+static inline uint64x2_t
+v_u64 (uint64_t x)
+{
+  return (uint64x2_t) V2 (x);
+}
+static inline int64x2_t
+v_s64 (int64_t x)
+{
+  return (int64x2_t) V2 (x);
+}
+
+/* true if any elements of a vector compare result is non-zero. */
+static inline int
+v_any_u64 (uint64x2_t x)
+{
+  /* assume elements in x are either 0 or -1u. */
+  return vpaddd_u64 (x) != 0;
+}
+/* true if all elements of a vector compare result is 1. */
+static inline int
+v_all_u64 (uint64x2_t x)
+{
+  /* assume elements in x are either 0 or -1u. */
+  return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
+}
+static inline float64x2_t
+v_lookup_f64 (const double *tab, uint64x2_t idx)
+{
+  return (float64x2_t){ tab[idx[0]], tab[idx[1]] };
+}
+static inline uint64x2_t
+v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
+{
+  return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
+}
+static inline float64x2_t
+v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
+{
+  return (float64x2_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1] };
+}
+static inline float64x2_t
+v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
+             float64x2_t y, uint64x2_t p)
+{
+  return (float64x2_t){ p[0] ? f (x1[0], x2[0]) : y[0],
+                        p[1] ? f (x1[1], x2[1]) : y[1] };
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
new file mode 100644
index 0000000000..d0bdbb4ae8
--- /dev/null
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -0,0 +1,38 @@
+/* Configuration for libmvec routines.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#ifndef _VECMATH_CONFIG_H
+#define _VECMATH_CONFIG_H
+
+#include <math_private.h>
+
+/* Deprecated config option from Arm Optimized Routines which ensures
+   fp exceptions are correctly triggered. This is not intended to be
+   supported in GLIBC, however we keep it for ease of development. */
+#define WANT_SIMD_EXCEPT 0
+
+/* Return ptr but hide its value from the compiler so accesses through it
+   cannot be optimized based on the contents. */
+#define ptr_barrier(ptr)                                                      \
+  ({                                                                          \
+    __typeof (ptr) __ptr = (ptr);                                             \
+    __asm("" : "+r"(__ptr));                                                  \
+    __ptr;                                                                    \
+  })
+
+#endif
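The AdvSIMD kernels added above share one structure: add pi/2, round to the nearest multiple of pi with the 0x1.8p52 (or 0x1.8p+23f) shift trick, subtract n*pi using a three-part pi (pi_1/pi_2/pi_3), evaluate a sin(r) polynomial, and fold the quadrant parity back in through the sign bit ('odd'). Below is a minimal scalar sketch of that flow for the double-precision case, reusing the coefficients from cos_advsimd.c; the function name cos_model, the poly[] array, and the use of scalar fabs/rint in place of the vector shift trick are illustrative assumptions, not part of the patch.

#include <math.h>
#include <stdint.h>

/* Hypothetical scalar model of the AdvSIMD cos kernel; illustration only. */
static const double poly[7] = {
  -0x1.555555555547bp-3, 0x1.1111111108a4dp-7, -0x1.a01a019936f27p-13,
  0x1.71de37a97d93ep-19, -0x1.ae633919987c6p-26, 0x1.60e277ae07cecp-33,
  -0x1.9e9540300a1p-41
};

static double
cos_model (double x)
{
  double r = fabs (x);
  /* Lanes with |x| >= 0x1p23 (range_val) or NaN take the special-case
     path; here we simply defer to the scalar cos. */
  if (!(r < 0x1p23))
    return cos (x);

  /* m = rint((|x| + pi/2)/pi); the vector code derives this from the
     shift trick and keeps bit 0 of m as 'odd'. */
  double m = rint ((r + 0x1.921fb54442d18p+0) * 0x1.45f306dc9c883p-2);
  double n = m - 0.5;

  /* r = |x| - n*pi, with pi split as pi_1 + pi_2 + pi_3 for accuracy. */
  r = r - n * 0x1.921fb54442d18p+1;
  r = r - n * 0x1.1a62633145c06p-53;
  r = r - n * 0x1.c1cd129024e09p-106;

  /* sin(r) ~ r + r^3 * P(r^2): same coefficients, evaluated by Horner
     here instead of the Estrin-style split used in the vector code. */
  double r2 = r * r, p = poly[6];
  for (int i = 5; i >= 0; i--)
    p = p * r2 + poly[i];
  double y = r + r * r2 * p;

  /* cos(x) = (-1)^m * sin(r); the vector code applies the sign by
     XOR-ing 'odd' into the sign bit of y. */
  return ((int64_t) m & 1) ? -y : y;
}

The SVE variants avoid the explicit polynomial: FTSMUL squares r while folding in the quadrant bits from q, the FTMAD chain supplies the hard-wired coefficients, and FTSSEL selects the final multiplicative factor from bit #0 of q, which is why their shift constants carry the extra contribution to that bit.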