diff options
Diffstat (limited to 'sysdeps')
232 files changed, 8235 insertions, 566 deletions
diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h index 5f2da91ebb..31782b66f9 100644 --- a/sysdeps/aarch64/cpu-features.h +++ b/sysdeps/aarch64/cpu-features.h @@ -47,13 +47,6 @@ #define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \ && MIDR_PARTNUM(midr) == 0xaf) -#define IS_NEOVERSE_N1(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ - && MIDR_PARTNUM(midr) == 0xd0c) -#define IS_NEOVERSE_N2(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ - && MIDR_PARTNUM(midr) == 0xd49) -#define IS_NEOVERSE_V1(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ - && MIDR_PARTNUM(midr) == 0xd40) - #define IS_EMAG(midr) (MIDR_IMPLEMENTOR(midr) == 'P' \ && MIDR_PARTNUM(midr) == 0x000) diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile index e8af35099d..234a6c457c 100644 --- a/sysdeps/aarch64/fpu/Makefile +++ b/sysdeps/aarch64/fpu/Makefile @@ -5,6 +5,7 @@ libmvec-supported-funcs = acos \ atan \ atanh \ atan2 \ + cbrt \ cos \ cosh \ erf \ @@ -13,10 +14,12 @@ libmvec-supported-funcs = acos \ exp10 \ exp2 \ expm1 \ + hypot \ log \ log10 \ log1p \ log2 \ + pow \ sin \ sinh \ tan \ @@ -42,7 +45,10 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \ sv_erff_data \ v_exp_tail_data \ erfc_data \ - erfcf_data + erfcf_data \ + v_pow_exp_data \ + v_pow_log_data \ + v_powf_data endif sve-cflags = -march=armv8-a+sve diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions index 3cb1b82bd2..cc15ce2d1e 100644 --- a/sysdeps/aarch64/fpu/Versions +++ b/sysdeps/aarch64/fpu/Versions @@ -94,6 +94,11 @@ libmvec { _ZGVnN4v_atanhf; _ZGVsMxv_atanh; _ZGVsMxv_atanhf; + _ZGVnN2v_cbrt; + _ZGVnN2v_cbrtf; + _ZGVnN4v_cbrtf; + _ZGVsMxv_cbrt; + _ZGVsMxv_cbrtf; _ZGVnN2v_cosh; _ZGVnN2v_coshf; _ZGVnN4v_coshf; @@ -109,6 +114,16 @@ libmvec { _ZGVnN4v_erfcf; _ZGVsMxv_erfc; _ZGVsMxv_erfcf; + _ZGVnN4vv_hypotf; + _ZGVnN2vv_hypotf; + _ZGVnN2vv_hypot; + _ZGVsMxvv_hypotf; + _ZGVsMxvv_hypot; + _ZGVnN4vv_powf; + _ZGVnN2vv_powf; + _ZGVnN2vv_pow; + _ZGVsMxvv_powf; + 
_ZGVsMxvv_pow; _ZGVnN2v_sinh; _ZGVnN2v_sinhf; _ZGVnN4v_sinhf; diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h index 383c436972..097d403ffe 100644 --- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h +++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h @@ -23,6 +23,7 @@ libmvec_hidden_proto (V_NAME_F1(asin)); libmvec_hidden_proto (V_NAME_F1(asinh)); libmvec_hidden_proto (V_NAME_F1(atan)); libmvec_hidden_proto (V_NAME_F1(atanh)); +libmvec_hidden_proto (V_NAME_F1(cbrt)); libmvec_hidden_proto (V_NAME_F1(cos)); libmvec_hidden_proto (V_NAME_F1(cosh)); libmvec_hidden_proto (V_NAME_F1(erf)); @@ -31,10 +32,12 @@ libmvec_hidden_proto (V_NAME_F1(exp10)); libmvec_hidden_proto (V_NAME_F1(exp2)); libmvec_hidden_proto (V_NAME_F1(exp)); libmvec_hidden_proto (V_NAME_F1(expm1)); +libmvec_hidden_proto (V_NAME_F2(hypot)); libmvec_hidden_proto (V_NAME_F1(log10)); libmvec_hidden_proto (V_NAME_F1(log1p)); libmvec_hidden_proto (V_NAME_F1(log2)); libmvec_hidden_proto (V_NAME_F1(log)); +libmvec_hidden_proto (V_NAME_F2(pow)); libmvec_hidden_proto (V_NAME_F1(sin)); libmvec_hidden_proto (V_NAME_F1(sinh)); libmvec_hidden_proto (V_NAME_F1(tan)); diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c index 544a52f651..6207e7da95 100644 --- a/sysdeps/aarch64/fpu/asinh_advsimd.c +++ b/sysdeps/aarch64/fpu/asinh_advsimd.c @@ -22,6 +22,7 @@ #define A(i) v_f64 (__v_log_data.poly[i]) #define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) const static struct data { @@ -63,11 +64,15 @@ struct entry static inline struct entry lookup (uint64x2_t i) { - float64x2_t e0 = vld1q_f64 ( - &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc); - float64x2_t e1 = vld1q_f64 ( - &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc); - return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) }; + /* Since N is a power of 2, n % N = n & (N - 1). 
*/ + struct entry e; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; } static inline float64x2_t diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c index 2fd6164134..b1e7a9b8fc 100644 --- a/sysdeps/aarch64/fpu/atan2_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2_advsimd.c @@ -17,6 +17,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ +#include "math_config.h" #include "v_math.h" #include "poly_advsimd_f64.h" diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c index 04fa71fa37..ed9f683436 100644 --- a/sysdeps/aarch64/fpu/atan2_sve.c +++ b/sysdeps/aarch64/fpu/atan2_sve.c @@ -17,6 +17,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ +#include "math_config.h" #include "sv_math.h" #include "poly_sve_f64.h" diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h index e29b2d1c09..7484150131 100644 --- a/sysdeps/aarch64/fpu/bits/math-vector.h +++ b/sysdeps/aarch64/fpu/bits/math-vector.h @@ -57,6 +57,10 @@ # define __DECL_SIMD_atan2 __DECL_SIMD_aarch64 # undef __DECL_SIMD_atan2f # define __DECL_SIMD_atan2f __DECL_SIMD_aarch64 +# undef __DECL_SIMD_cbrt +# define __DECL_SIMD_cbrt __DECL_SIMD_aarch64 +# undef __DECL_SIMD_cbrtf +# define __DECL_SIMD_cbrtf __DECL_SIMD_aarch64 # undef __DECL_SIMD_cos # define __DECL_SIMD_cos __DECL_SIMD_aarch64 # undef __DECL_SIMD_cosf @@ -89,6 +93,10 @@ # define __DECL_SIMD_expm1 __DECL_SIMD_aarch64 # undef __DECL_SIMD_expm1f # define __DECL_SIMD_expm1f __DECL_SIMD_aarch64 +# undef __DECL_SIMD_hypot +# define __DECL_SIMD_hypot __DECL_SIMD_aarch64 +# undef __DECL_SIMD_hypotf +# define __DECL_SIMD_hypotf __DECL_SIMD_aarch64 # undef __DECL_SIMD_log # define __DECL_SIMD_log __DECL_SIMD_aarch64 # undef __DECL_SIMD_logf @@ -105,6 +113,10 @@ # define __DECL_SIMD_log2 __DECL_SIMD_aarch64 # undef __DECL_SIMD_log2f # define __DECL_SIMD_log2f __DECL_SIMD_aarch64 +# undef __DECL_SIMD_pow +# define __DECL_SIMD_pow __DECL_SIMD_aarch64 +# undef __DECL_SIMD_powf +# define __DECL_SIMD_powf __DECL_SIMD_aarch64 # undef __DECL_SIMD_sin # define __DECL_SIMD_sin __DECL_SIMD_aarch64 # undef __DECL_SIMD_sinf @@ -154,6 +166,7 @@ __vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); @@ -162,10 +175,12 @@ __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t 
_ZGVnN4v_expm1f (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); @@ -178,6 +193,7 @@ __vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); @@ -186,10 +202,12 @@ __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t); @@ -207,6 +225,7 @@ __sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_atanhf (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_cbrtf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t); @@ -215,10 +234,12 @@ __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_exp10f 
(__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxvv_hypotf (__sv_f32_t, __sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t); @@ -231,6 +252,7 @@ __sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_atanh (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_cbrt (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t); @@ -239,10 +261,12 @@ __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxvv_hypot (__sv_f64_t, __sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t); diff --git a/sysdeps/aarch64/fpu/cbrt_advsimd.c b/sysdeps/aarch64/fpu/cbrt_advsimd.c new file mode 100644 index 0000000000..adfbb60cd3 --- /dev/null +++ b/sysdeps/aarch64/fpu/cbrt_advsimd.c @@ -0,0 
+1,121 @@ +/* Double-precision vector (AdvSIMD) cbrt function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" + +const static struct data +{ + float64x2_t poly[4], one_third, shift; + int64x2_t exp_bias; + uint64x2_t abs_mask, tiny_bound; + uint32x4_t thresh; + double table[5]; +} data = { + .shift = V2 (0x1.8p52), + .poly = { /* Generated with fpminimax in [0.5, 1]. */ + V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1), + V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) }, + .exp_bias = V2 (1022), + .abs_mask = V2(0x7fffffffffffffff), + .tiny_bound = V2(0x0010000000000000), /* Smallest normal. */ + .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound. */ + .one_third = V2(0x1.5555555555555p-2), + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, + 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 } +}; + +#define MantissaMask v_u64 (0x000fffffffffffff) + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint32x2_t special) +{ + return v_call_f64 (cbrt, x, y, vmovl_u32 (special)); +} + +/* Approximation for double-precision vector cbrt(x), using low-order polynomial + and two Newton iterations. 
Greatest observed error is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. */ +VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + + /* Subnormal, +/-0 and special values. */ + uint32x2_t special + = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. */ + float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5)); + int64x2_t exp_bias = d->exp_bias; + uint64x2_t ia12 = vshrq_n_u64 (iax, 52); + int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias); + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for + Newton iterations. */ + float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly); + float64x2_t one_third = d->one_third; + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + float64x2_t m_by_3 = vmulq_f64 (m, one_third); + float64x2_t two_thirds = vaddq_f64 (one_third, one_third); + float64x2_t a + = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p); + a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is + an integer in [-2, 2], and can be looked up in the table T. 
Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + + float64x2_t ef = vcvtq_f64_s64 (e); + float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third)); + int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3))); + int64x2_t ey = vcvtq_s64_f64 (eb3f); + + float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] }; + my = vmulq_f64 (my, a); + + /* Vector version of ldexp. */ + float64x2_t y = vreinterpretq_f64_s64 ( + vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52)); + y = vmulq_f64 (y, my); + + if (__glibc_unlikely (v_any_u32h (special))) + return special_case (x, vbslq_f64 (d->abs_mask, y, x), special); + + /* Copy sign. */ + return vbslq_f64 (d->abs_mask, y, x); +} diff --git a/sysdeps/aarch64/fpu/cbrt_sve.c b/sysdeps/aarch64/fpu/cbrt_sve.c new file mode 100644 index 0000000000..fc976eda2a --- /dev/null +++ b/sysdeps/aarch64/fpu/cbrt_sve.c @@ -0,0 +1,128 @@ +/* Double-precision vector (SVE) cbrt function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "sv_math.h" +#include "poly_sve_f64.h" + +const static struct data +{ + float64_t poly[4]; + float64_t table[5]; + float64_t one_third, two_thirds, shift; + int64_t exp_bias; + uint64_t tiny_bound, thresh; +} data = { + /* Generated with FPMinimax in [0.5, 1]. */ + .poly = { 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, + 0x1.2c74eaa3ba428p-3, }, + /* table[i] = 2^((i - 2) / 3). */ + .table = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, + 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0, }, + .one_third = 0x1.5555555555555p-2, + .two_thirds = 0x1.5555555555555p-1, + .shift = 0x1.8p52, + .exp_bias = 1022, + .tiny_bound = 0x0010000000000000, /* Smallest normal. */ + .thresh = 0x7fe0000000000000, /* asuint64 (infinity) - tiny_bound. */ +}; + +#define MantissaMask 0x000fffffffffffff +#define HalfExp 0x3fe0000000000000 + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (cbrt, x, y, special); +} + +static inline svfloat64_t +shifted_lookup (const svbool_t pg, const float64_t *table, svint64_t i) +{ + return svld1_gather_index (pg, table, svadd_x (pg, i, 2)); +} + +/* Approximation for double-precision vector cbrt(x), using low-order + polynomial and two Newton iterations. Greatest observed error is 1.79 ULP. + Errors repeat according to the exponent, for instance an error observed for + double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i + is an integer. + _ZGVsMxv_cbrt (0x0.3fffb8d4413f3p-1022) got 0x1.965f53b0e5d97p-342 + want 0x1.965f53b0e5d95p-342. */ +svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svuint64_t iax = svreinterpret_u64 (ax); + svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax); + + /* Subnormal, +/-0 and special values. 
*/ + svbool_t special = svcmpge (pg, svsub_x (pg, iax, d->tiny_bound), d->thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. */ + svfloat64_t m = svreinterpret_f64 (svorr_x ( + pg, svand_x (pg, svreinterpret_u64 (x), MantissaMask), HalfExp)); + svint64_t e + = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, iax, 52)), d->exp_bias); + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point + for Newton iterations. */ + svfloat64_t p + = sv_pairwise_poly_3_f64_x (pg, m, svmul_x (pg, m, m), d->poly); + + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + svfloat64_t m_by_3 = svmul_x (pg, m, d->one_third); + svfloat64_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p, + d->two_thirds); + a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, a, a)), a, d->two_thirds); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + svfloat64_t eb3f = svmul_x (pg, svcvt_f64_x (pg, e), d->one_third); + svint64_t ey = svcvt_s64_x (pg, eb3f); + svint64_t em3 = svmls_x (pg, e, ey, 3); + + svfloat64_t my = shifted_lookup (pg, d->table, em3); + my = svmul_x (pg, my, a); + + /* Vector version of ldexp. */ + svfloat64_t y = svscale_x (pg, my, ey); + + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case ( + x, svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)), + special); + + /* Copy sign. 
*/ + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); +} diff --git a/sysdeps/aarch64/fpu/cbrtf_advsimd.c b/sysdeps/aarch64/fpu/cbrtf_advsimd.c new file mode 100644 index 0000000000..27debb8b57 --- /dev/null +++ b/sysdeps/aarch64/fpu/cbrtf_advsimd.c @@ -0,0 +1,123 @@ +/* Single-precision vector (AdvSIMD) cbrt function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" +#include "poly_advsimd_f32.h" + +const static struct data +{ + float32x4_t poly[4], one_third; + float table[5]; +} data = { + .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with + FPMinimax. */ + V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1), + V4 (0x1.2c74c2p-3) }, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 }, + .one_third = V4 (0x1.555556p-2f), +}; + +#define SignMask v_u32 (0x80000000) +#define SmallestNormal v_u32 (0x00800000) +#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal. 
*/ +#define MantissaMask v_u32 (0x007fffff) +#define HalfExp v_u32 (0x3f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint16x4_t special) +{ + return v_call_f32 (cbrtf, x, y, vmovl_u16 (special)); +} + +static inline float32x4_t +shifted_lookup (const float *table, int32x4_t i) +{ + return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2], + table[i[3] + 2] }; +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration + with initial guess obtained by a low-order polynomial. Greatest error + is 1.64 ULP. This is observed for every value where the mantissa is + 0x1.85a2aa and the exponent is a multiple of 3, for example: + _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1 + want 0x1.267932p+1. */ +VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x)); + + /* Subnormal, +/-0 and special values. */ + uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5)); + int32x4_t e + = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126)); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly); + + float32x4_t one_third = d->one_third; + float32x4_t two_thirds = vaddq_f32 (one_third, one_third); + + /* One iteration of Newton's method for iteratively approximating cbrt. 
*/ + float32x4_t m_by_3 = vmulq_f32 (m, one_third); + float32x4_t a + = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third); + int32x4_t ey = vcvtq_s32_f32 (ef); + int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3))); + + float32x4_t my = shifted_lookup (d->table, em3); + my = vmulq_f32 (my, a); + + /* Vector version of ldexpf. */ + float32x4_t y + = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23)); + y = vmulq_f32 (y, my); + + if (__glibc_unlikely (v_any_u16h (special))) + return special_case (x, vbslq_f32 (SignMask, x, y), special); + + /* Copy sign. */ + return vbslq_f32 (SignMask, x, y); +} +libmvec_hidden_def (V_NAME_F1 (cbrt)) +HALF_WIDTH_ALIAS_F1 (cbrt) diff --git a/sysdeps/aarch64/fpu/cbrtf_sve.c b/sysdeps/aarch64/fpu/cbrtf_sve.c new file mode 100644 index 0000000000..23c220c202 --- /dev/null +++ b/sysdeps/aarch64/fpu/cbrtf_sve.c @@ -0,0 +1,122 @@ +/* Single-precision vector (SVE) cbrt function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" +#include "poly_sve_f32.h" + +const static struct data +{ + float32_t poly[4]; + float32_t table[5]; + float32_t one_third, two_thirds; +} data = { + /* Very rough approximation of cbrt(x) in [0.5, 1], generated with FPMinimax. + */ + .poly = { 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, + 0x1.2c74c2p-3, }, + /* table[i] = 2^((i - 2) / 3). */ + .table = { 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 }, + .one_third = 0x1.555556p-2f, + .two_thirds = 0x1.555556p-1f, +}; + +#define SmallestNormal 0x00800000 +#define Thresh 0x7f000000 /* asuint(INFINITY) - SmallestNormal. */ +#define MantissaMask 0x007fffff +#define HalfExp 0x3f000000 + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (cbrtf, x, y, special); +} + +static inline svfloat32_t +shifted_lookup (const svbool_t pg, const float32_t *table, svint32_t i) +{ + return svld1_gather_index (pg, table, svadd_x (pg, i, 2)); +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration + with initial guess obtained by a low-order polynomial. Greatest error + is 1.64 ULP. This is observed for every value where the mantissa is + 0x1.85a2aa and the exponent is a multiple of 3, for example: + _ZGVsMxv_cbrtf (0x1.85a2aap+3) got 0x1.267936p+1 + want 0x1.267932p+1. 
*/ +svfloat32_t SV_NAME_F1 (cbrt) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + + /* Subnormal, +/-0 and special values. */ + svbool_t special = svcmpge (pg, svsub_x (pg, iax, SmallestNormal), Thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + svfloat32_t m = svreinterpret_f32 (svorr_x ( + pg, svand_x (pg, svreinterpret_u32 (x), MantissaMask), HalfExp)); + svint32_t e = svsub_x (pg, svreinterpret_s32 (svlsr_x (pg, iax, 23)), 126); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + svfloat32_t p + = sv_pairwise_poly_3_f32_x (pg, m, svmul_x (pg, m, m), d->poly); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + svfloat32_t m_by_3 = svmul_x (pg, m, d->one_third); + svfloat32_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p, + d->two_thirds); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. 
*/ + svfloat32_t ef = svmul_x (pg, svcvt_f32_x (pg, e), d->one_third); + svint32_t ey = svcvt_s32_x (pg, ef); + svint32_t em3 = svmls_x (pg, e, ey, 3); + + svfloat32_t my = shifted_lookup (pg, d->table, em3); + my = svmul_x (pg, my, a); + + /* Vector version of ldexpf. */ + svfloat32_t y = svscale_x (pg, my, ey); + + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case ( + x, svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)), + special); + + /* Copy sign. */ + return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)); +} diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c index ec7b59637e..4bee734f00 100644 --- a/sysdeps/aarch64/fpu/cosh_advsimd.c +++ b/sysdeps/aarch64/fpu/cosh_advsimd.c @@ -22,7 +22,9 @@ static const struct data { float64x2_t poly[3]; - float64x2_t inv_ln2, ln2, shift, thres; + float64x2_t inv_ln2; + double ln2[2]; + float64x2_t shift, thres; uint64x2_t index_mask, special_bound; } data = { .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3), @@ -58,8 +60,9 @@ exp_inline (float64x2_t x) float64x2_t n = vsubq_f64 (z, d->shift); /* r = x - n*ln2/N. 
*/ - float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0); - r = vfmaq_laneq_f64 (r, n, d->ln2, 1); + float64x2_t ln2 = vld1q_f64 (d->ln2); + float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0); + r = vfmaq_laneq_f64 (r, n, ln2, 1); uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); uint64x2_t i = vandq_u64 (u, d->index_mask); diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c index 3e70cbc025..19cbb7d0f4 100644 --- a/sysdeps/aarch64/fpu/erf_advsimd.c +++ b/sysdeps/aarch64/fpu/erf_advsimd.c @@ -56,8 +56,8 @@ static inline struct entry lookup (uint64x2_t i) { struct entry e; - float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])), - e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1])); + float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf), + e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf); e.erf = vuzp1q_f64 (e1, e2); e.scale = vuzp2q_f64 (e1, e2); return e; diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c index 548f21a3d6..f1b3bfe830 100644 --- a/sysdeps/aarch64/fpu/erfc_advsimd.c +++ b/sysdeps/aarch64/fpu/erfc_advsimd.c @@ -26,7 +26,7 @@ static const struct data float64x2_t max, shift; float64x2_t p20, p40, p41, p42; float64x2_t p51, p52; - float64x2_t qr5, qr6, qr7, qr8, qr9; + double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2]; #if WANT_SIMD_EXCEPT float64x2_t uflow_bound; #endif @@ -68,8 +68,10 @@ static inline struct entry lookup (uint64x2_t i) { struct entry e; - float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])), - e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1])); + float64x2_t e1 + = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); + float64x2_t e2 + = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); e.erfc = vuzp1q_f64 (e1, e2); e.scale = vuzp2q_f64 (e1, e2); return e; @@ -161,16 +163,19 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x) p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 
(0.5), dat->p20), r2, p5)); /* Compute p_i using recurrence relation: p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ - float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0)); - p6 = vmulq_laneq_f64 (p6, dat->qr5, 1); - float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0)); - p7 = vmulq_laneq_f64 (p7, dat->qr6, 1); - float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0)); - p8 = vmulq_laneq_f64 (p8, dat->qr7, 1); - float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0)); - p9 = vmulq_laneq_f64 (p9, dat->qr8, 1); - float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0)); - p10 = vmulq_laneq_f64 (p10, dat->qr9, 1); + float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6), + qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8), + qr9 = vld1q_f64 (dat->qr9); + float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0)); + p6 = vmulq_laneq_f64 (p6, qr5, 1); + float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0)); + p7 = vmulq_laneq_f64 (p7, qr6, 1); + float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0)); + p8 = vmulq_laneq_f64 (p8, qr7, 1); + float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0)); + p9 = vmulq_laneq_f64 (p9, qr8, 1); + float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0)); + p10 = vmulq_laneq_f64 (p10, qr9, 1); /* Compute polynomial in d using pairwise Horner scheme. 
*/ float64x2_t p90 = vfmaq_f64 (p9, d, p10); float64x2_t p78 = vfmaq_f64 (p7, d, p8); diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c index 30b9e48dd4..ca5bc3ab33 100644 --- a/sysdeps/aarch64/fpu/erfcf_advsimd.c +++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c @@ -23,7 +23,8 @@ static const struct data { uint32x4_t offset, table_scale; float32x4_t max, shift; - float32x4_t coeffs, third, two_over_five, tenth; + float coeffs[4]; + float32x4_t third, two_over_five, tenth; #if WANT_SIMD_EXCEPT float32x4_t uflow_bound; #endif @@ -37,7 +38,7 @@ static const struct data .shift = V4 (0x1p17f), /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and fmas. */ - .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, + .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, .third = V4 (0x1.555556p-2f), .two_over_five = V4 (-0x1.99999ap-2f), .tenth = V4 (-0x1.99999ap-4f), @@ -60,12 +61,16 @@ static inline struct entry lookup (uint32x4_t i) { struct entry e; - float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0])); - float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1])); - float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2])); - float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3])); - float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 }); - float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 }); + float32x2_t t0 + = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); + float32x2_t t1 + = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); + float32x2_t t2 + = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); + float32x2_t t3 + = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); e.erfc = vuzp1q_f32 (e1, e2); e.scale = vuzp2q_f32 (e1, e2); return e; @@ -140,10 +145,11 @@ float32x4_t NOINLINE V_NAME_F1 
(erfc) (float32x4_t x) float32x4_t r2 = vmulq_f32 (r, r); float32x4_t p1 = r; - float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1); + float32x4_t coeffs = vld1q_f32 (dat->coeffs); + float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1); float32x4_t p3 - = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0)); - float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2); + = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0)); + float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2); p4 = vfmsq_f32 (dat->tenth, r2, p4); float32x4_t y = vfmaq_f32 (p3, d, p4); diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c index c44644a71c..f2fe6ff236 100644 --- a/sysdeps/aarch64/fpu/erff_advsimd.c +++ b/sysdeps/aarch64/fpu/erff_advsimd.c @@ -47,12 +47,12 @@ static inline struct entry lookup (uint32x4_t i) { struct entry e; - float64_t t0 = *((float64_t *) (__erff_data.tab + i[0])); - float64_t t1 = *((float64_t *) (__erff_data.tab + i[1])); - float64_t t2 = *((float64_t *) (__erff_data.tab + i[2])); - float64_t t3 = *((float64_t *) (__erff_data.tab + i[3])); - float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 }); - float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 }); + float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf); + float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf); + float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf); + float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); e.erf = vuzp1q_f32 (e1, e2); e.scale = vuzp2q_f32 (e1, e2); return e; diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c index ab117b69da..cf53e73290 100644 --- a/sysdeps/aarch64/fpu/exp10f_advsimd.c +++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c @@ -25,7 +25,8 @@ static const 
struct data { float32x4_t poly[5]; - float32x4_t log10_2_and_inv, shift; + float log10_2_and_inv[4]; + float32x4_t shift; #if !WANT_SIMD_EXCEPT float32x4_t scale_thresh; @@ -111,10 +112,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), with poly(r) in [1/sqrt(2), sqrt(2)] and x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */ - float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0); + float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv); + float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0); float32x4_t n = vsubq_f32 (z, d->shift); - float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1); - r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2); + float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1); + r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2); uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c index 3628398674..3db3b80c49 100644 --- a/sysdeps/aarch64/fpu/expm1_advsimd.c +++ b/sysdeps/aarch64/fpu/expm1_advsimd.c @@ -23,7 +23,9 @@ static const struct data { float64x2_t poly[11]; - float64x2_t invln2, ln2, shift; + float64x2_t invln2; + double ln2[2]; + float64x2_t shift; int64x2_t exponent_bias; #if WANT_SIMD_EXCEPT uint64x2_t thresh, tiny_bound; @@ -92,8 +94,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x) where 2^i is exact because i is an integer. */ float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift); int64x2_t i = vcvtq_s64_f64 (n); - float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0); - f = vfmsq_laneq_f64 (f, n, d->ln2, 1); + float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); + float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0); + f = vfmsq_laneq_f64 (f, n, ln2, 1); /* Approximate expm1(f) using polynomial. 
Taylor expansion for expm1(x) has the form: diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c index 93db200f61..a0616ec754 100644 --- a/sysdeps/aarch64/fpu/expm1f_advsimd.c +++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c @@ -23,7 +23,7 @@ static const struct data { float32x4_t poly[5]; - float32x4_t invln2_and_ln2; + float invln2_and_ln2[4]; float32x4_t shift; int32x4_t exponent_bias; #if WANT_SIMD_EXCEPT @@ -88,11 +88,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x) and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where 2^i is exact because i is an integer. */ - float32x4_t j = vsubq_f32 ( - vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift); + float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); + float32x4_t j + = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift); int32x4_t i = vcvtq_s32_f32 (j); - float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1); - f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2); + float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1); + f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2); /* Approximate expm1(f) using polynomial. Taylor expansion for expm1(x) has the form: diff --git a/sysdeps/aarch64/fpu/finite_pow.h b/sysdeps/aarch64/fpu/finite_pow.h new file mode 100644 index 0000000000..84c93d4048 --- /dev/null +++ b/sysdeps/aarch64/fpu/finite_pow.h @@ -0,0 +1,373 @@ +/* Double-precision x^y function. + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "math_config.h" + +/* Scalar version of pow used for fallbacks in vector implementations. */ + +/* Data is defined in v_pow_log_data.c. */ +#define N_LOG (1 << V_POW_LOG_TABLE_BITS) +#define Off 0x3fe6955500000000 +#define As __v_pow_log_data.poly + +/* Data is defined in v_pow_exp_data.c. */ +#define N_EXP (1 << V_POW_EXP_TABLE_BITS) +#define SignBias (0x800 << V_POW_EXP_TABLE_BITS) +#define SmallExp 0x3c9 /* top12(0x1p-54). */ +#define BigExp 0x408 /* top12(512.0). */ +#define ThresExp 0x03f /* BigExp - SmallExp. */ +#define InvLn2N __v_pow_exp_data.n_over_ln2 +#define Ln2HiN __v_pow_exp_data.ln2_over_n_hi +#define Ln2LoN __v_pow_exp_data.ln2_over_n_lo +#define SBits __v_pow_exp_data.sbits +#define Cs __v_pow_exp_data.poly + +/* Constants associated with pow. */ +#define SmallPowX 0x001 /* top12(0x1p-126). */ +#define BigPowX 0x7ff /* top12(INFINITY). */ +#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ +#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */ +#define BigPowY 0x43e /* top12(0x1.749p62). */ +#define ThresPowY 0x080 /* BigPowY - SmallPowY. */ + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t +top12 (double x) +{ + return asuint64 (x) >> 52; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline double +log_inline (uint64_t ix, double *tail) +{ + /* x = 2^k z; where z is in range [Off,2*Off) and exact. 
+ The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + uint64_t tmp = ix - Off; + int i = (tmp >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1); + int k = (int64_t) tmp >> 52; /* arithmetic shift. */ + uint64_t iz = ix - (tmp & 0xfffULL << 52); + double z = asdouble (iz); + double kd = (double) k; + + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ + double invc = __v_pow_log_data.invc[i]; + double logc = __v_pow_log_data.logc[i]; + double logctail = __v_pow_log_data.logctail[i]; + + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ + double r = fma (z, invc, -1.0); + + /* k*Ln2 + log(c) + r. */ + double t1 = kd * __v_pow_log_data.ln2_hi + logc; + double t2 = t1 + r; + double lo1 = kd * __v_pow_log_data.ln2_lo + logctail; + double lo2 = t1 - t2 + r; + + /* Evaluation is optimized assuming superscalar pipelined execution. */ + double ar = As[0] * r; + double ar2 = r * ar; + double ar3 = r * ar2; + /* k*Ln2 + log(c) + r + A[0]*r*r. */ + double hi = t2 + ar2; + double lo3 = fma (ar, r, -ar2); + double lo4 = t2 - hi + ar2; + /* p = log1p(r) - r - A[0]*r*r. */ + double p = (ar3 + * (As[1] + r * As[2] + + ar2 * (As[3] + r * As[4] + ar2 * (As[5] + r * As[6])))); + double lo = lo1 + lo2 + lo3 + lo4 + p; + double y = hi + lo; + *tail = hi - y + lo; + return y; +} + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. 
*/ +static inline double +special_case (double tmp, uint64_t sbits, uint64_t ki) +{ + double scale, y; + + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + y = 0x1p1009 * (scale + scale * tmp); + return y; + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + /* Note: sbits is signed scale. */ + scale = asdouble (sbits); + y = scale + scale * tmp; +#if WANT_SIMD_EXCEPT + if (fabs (y) < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double hi, lo, one = 1.0; + if (y < 0.0) + one = -1.0; + lo = scale - y + scale * tmp; + hi = one + y; + lo = one - hi + y + lo; + y = (hi + lo) - one; + /* Fix the sign of 0. */ + if (y == 0.0) + y = asdouble (sbits & 0x8000000000000000); + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } +#endif + y = 0x1p-1022 * y; + return y; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ +static inline double +exp_inline (double x, double xtail, uint32_t sign_bias) +{ + uint32_t abstop = top12 (x) & 0x7ff; + if (__glibc_unlikely (abstop - SmallExp >= ThresExp)) + { + if (abstop - SmallExp >= 0x80000000) + { + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + return sign_bias ? -1.0 : 1.0; + } + if (abstop >= top12 (1024.0)) + { + /* Note: inf and nan are already handled. */ + /* Skip errno handling. */ +#if WANT_SIMD_EXCEPT + return asuint64 (x) >> 63 ? 
__math_uflow (sign_bias) + : __math_oflow (sign_bias); +#else + double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY; + return sign_bias ? -res_uoflow : res_uoflow; +#endif + } + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + double z = InvLn2N * x; + double kd = round (z); + uint64_t ki = lround (z); + double r = x - kd * Ln2HiN - kd * Ln2LoN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale. */ + uint64_t idx = ki & (N_EXP - 1); + uint64_t top = (ki + sign_bias) << (52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64_t sbits = SBits[idx] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + double r2 = r * r; + double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]); + if (__glibc_unlikely (abstop == 0)) + return special_case (tmp, sbits, ki); + double scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return scale + scale * tmp; +} + +/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + A version of exp_inline that is not inlined and for which sign_bias is + equal to 0. */ +static double NOINLINE +exp_nosignbias (double x, double xtail) +{ + uint32_t abstop = top12 (x) & 0x7ff; + if (__glibc_unlikely (abstop - SmallExp >= ThresExp)) + { + /* Avoid spurious underflow for tiny x. */ + if (abstop - SmallExp >= 0x80000000) + return 1.0; + /* Note: inf and nan are already handled. */ + if (abstop >= top12 (1024.0)) +#if WANT_SIMD_EXCEPT + return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0); +#else + return asuint64 (x) >> 63 ? 0.0 : INFINITY; +#endif + /* Large x is special cased below. 
*/ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ + double z = InvLn2N * x; + double kd = round (z); + uint64_t ki = lround (z); + double r = x - kd * Ln2HiN - kd * Ln2LoN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale. */ + uint64_t idx = ki & (N_EXP - 1); + uint64_t top = ki << (52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64_t sbits = SBits[idx] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ + double r2 = r * r; + double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]); + if (__glibc_unlikely (abstop == 0)) + return special_case (tmp, sbits, ki); + double scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return scale + scale * tmp; +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. */ +static inline int +checkint (uint64_t iy) +{ + int e = iy >> 52 & 0x7ff; + if (e < 0x3ff) + return 0; + if (e > 0x3ff + 52) + return 2; + if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) + return 0; + if (iy & (1ULL << (0x3ff + 52 - e))) + return 1; + return 2; +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. 
*/ +static inline int +zeroinfnan (uint64_t i) +{ + return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1; +} + +static double NOINLINE +pow_scalar_special_case (double x, double y) +{ + uint32_t sign_bias = 0; + uint64_t ix, iy; + uint32_t topx, topy; + + ix = asuint64 (x); + iy = asuint64 (y); + topx = top12 (x); + topy = top12 (y); + if (__glibc_unlikely (topx - SmallPowX >= ThresPowX + || (topy & 0x7ff) - SmallPowY >= ThresPowY)) + { + /* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0 + and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */ + /* Special cases: (x < 0x1p-126 or inf or nan) or + (|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */ + if (__glibc_unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignaling_inline (x) ? x + y : 1.0; + if (ix == asuint64 (1.0)) + return issignaling_inline (y) ? x + y : 1.0; + if (2 * ix > 2 * asuint64 (INFINITY) + || 2 * iy > 2 * asuint64 (INFINITY)) + return x + y; + if (2 * ix == 2 * asuint64 (1.0)) + return 1.0; + if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63)) + return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (__glibc_unlikely (zeroinfnan (ix))) + { + double x2 = x * x; + if (ix >> 63 && checkint (iy) == 1) + { + x2 = -x2; + sign_bias = 1; + } +#if WANT_SIMD_EXCEPT + if (2 * ix == 0 && iy >> 63) + return __math_divzero (sign_bias); +#endif + return iy >> 63 ? 1 / x2 : x2; + } + /* Here x and y are non-zero finite. */ + if (ix >> 63) + { + /* Finite x < 0. */ + int yint = checkint (iy); + if (yint == 0) +#if WANT_SIMD_EXCEPT + return __math_invalid (x); +#else + return __builtin_nan (""); +#endif + if (yint == 1) + sign_bias = SignBias; + ix &= 0x7fffffffffffffff; + topx &= 0x7ff; + } + if ((topy & 0x7ff) - SmallPowY >= ThresPowY) + { + /* Note: sign_bias == 0 here because y is not odd. */ + if (ix == asuint64 (1.0)) + return 1.0; + /* |y| < 2^-65, x^y ~= 1 + y*log(x). 
*/ + if ((topy & 0x7ff) < SmallPowY) + return 1.0; +#if WANT_SIMD_EXCEPT + return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0) + : __math_uflow (0); +#else + return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0; +#endif + } + if (topx == 0) + { + /* Normalize subnormal x so exponent becomes negative. */ + ix = asuint64 (x * 0x1p52); + ix &= 0x7fffffffffffffff; + ix -= 52ULL << 52; + } + } + + double lo; + double hi = log_inline (ix, &lo); + double ehi = y * hi; + double elo = y * lo + fma (y, hi, -ehi); + return exp_inline (ehi, elo, sign_bias); +} diff --git a/sysdeps/aarch64/fpu/hypot_advsimd.c b/sysdeps/aarch64/fpu/hypot_advsimd.c new file mode 100644 index 0000000000..e4e279fa0c --- /dev/null +++ b/sysdeps/aarch64/fpu/hypot_advsimd.c @@ -0,0 +1,97 @@ +/* Double-precision vector (Advanced SIMD) hypot function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +#if WANT_SIMD_EXCEPT +static const struct data +{ + uint64x2_t tiny_bound, thres; +} data = { + .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */ + .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. 
*/ +}; +#else +static const struct data +{ + uint64x2_t tiny_bound; + uint32x4_t thres; +} data = { + .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */ + .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */ +}; +#endif + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum, + uint32x2_t special) +{ + return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special)); +} + +/* Vector implementation of double-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222) + got 0x1.6a1b19400964ep-204 + want 0x1.6a1b19400964dp-204. */ +#if WANT_SIMD_EXCEPT + +float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t ix = vreinterpretq_u64_f64 (ax); + uint64x2_t iy = vreinterpretq_u64_f64 (ay); + + /* Extreme values, NaNs, and infinities should be handled by the scalar + fallback for correct flag handling. 
*/ + uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres); + uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres); + ax = v_zerofy_f64 (ax, specialx); + ay = v_zerofy_f64 (ay, specialy); + uint32x2_t special = vaddhn_u64 (specialx, specialy); + + float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay); + + if (__glibc_unlikely (v_any_u32h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f64 (sqsum); +} +#else + +float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y); + + uint32x2_t special = vcge_u32 ( + vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound), + vget_low_u32 (d->thres)); + + if (__glibc_unlikely (v_any_u32h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f64 (sqsum); +} +#endif diff --git a/sysdeps/aarch64/fpu/hypot_sve.c b/sysdeps/aarch64/fpu/hypot_sve.c new file mode 100644 index 0000000000..74417040ac --- /dev/null +++ b/sysdeps/aarch64/fpu/hypot_sve.c @@ -0,0 +1,54 @@ +/* Double-precision vector (SVE) hypot function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "sv_math.h" + +static const struct data +{ + uint64_t tiny_bound, thres; +} data = { + .tiny_bound = 0x0c80000000000000, /* asuint (0x1p-102). */ + .thres = 0x7300000000000000, /* asuint (inf) - tiny_bound. */ +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t sqsum, svfloat64_t x, svfloat64_t y, svbool_t pg, + svbool_t special) +{ + return sv_call2_f64 (hypot, x, y, svsqrt_x (pg, sqsum), special); +} + +/* SVE implementation of double-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVsMxvv_hypot (-0x1.6a22d0412cdd3p+352, 0x1.d3d89bd66fb1ap+330) + got 0x1.6a22d0412cfp+352 + want 0x1.6a22d0412cf01p+352. */ +svfloat64_t SV_NAME_D2 (hypot) (svfloat64_t x, svfloat64_t y, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y); + + svbool_t special = svcmpge ( + pg, svsub_x (pg, svreinterpret_u64 (sqsum), d->tiny_bound), d->thres); + + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case (sqsum, x, y, pg, special); + return svsqrt_x (pg, sqsum); +} diff --git a/sysdeps/aarch64/fpu/hypotf_advsimd.c b/sysdeps/aarch64/fpu/hypotf_advsimd.c new file mode 100644 index 0000000000..34818b021a --- /dev/null +++ b/sysdeps/aarch64/fpu/hypotf_advsimd.c @@ -0,0 +1,98 @@ +/* Single-precision vector (Advanced SIMD) hypot function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +#if WANT_SIMD_EXCEPT +static const struct data +{ + uint32x4_t tiny_bound, thres; +} data = { + .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */ + .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */ +}; +#else +static const struct data +{ + uint32x4_t tiny_bound; + uint16x8_t thres; +} data = { + .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */ + .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */ +}; +#endif + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum, + uint16x4_t special) +{ + return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special)); +} + +/* Vector implementation of single-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13 + want 0x1.6a41dp-13. */ +#if WANT_SIMD_EXCEPT + +float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t ax = vabsq_f32 (x); + float32x4_t ay = vabsq_f32 (y); + + uint32x4_t ix = vreinterpretq_u32_f32 (ax); + uint32x4_t iy = vreinterpretq_u32_f32 (ay); + + /* Extreme values, NaNs, and infinities should be handled by the scalar + fallback for correct flag handling. 
*/ + uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres); + uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres); + ax = v_zerofy_f32 (ax, specialx); + ay = v_zerofy_f32 (ay, specialy); + uint16x4_t special = vaddhn_u32 (specialx, specialy); + + float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay); + + if (__glibc_unlikely (v_any_u16h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f32 (sqsum); +} +#else + +float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y); + + uint16x4_t special = vcge_u16 ( + vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound), + vget_low_u16 (d->thres)); + + if (__glibc_unlikely (v_any_u16h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f32 (sqsum); +} +#endif +libmvec_hidden_def (V_NAME_F2 (hypot)) +HALF_WIDTH_ALIAS_F2(hypot) diff --git a/sysdeps/aarch64/fpu/hypotf_sve.c b/sysdeps/aarch64/fpu/hypotf_sve.c new file mode 100644 index 0000000000..3a403de66e --- /dev/null +++ b/sysdeps/aarch64/fpu/hypotf_sve.c @@ -0,0 +1,48 @@ +/* Single-precision vector (SVE) hypot function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +#define TinyBound 0x0c800000 /* asuint (0x1p-102). */ +#define Thres 0x73000000 /* 0x70000000 - TinyBound. */ + +static svfloat32_t NOINLINE +special_case (svfloat32_t sqsum, svfloat32_t x, svfloat32_t y, svbool_t pg, + svbool_t special) +{ + return sv_call2_f32 (hypotf, x, y, svsqrt_x (pg, sqsum), special); +} + +/* SVE implementation of single-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVsMxvv_hypotf (0x1.6a213cp-19, -0x1.32b982p-26) got 0x1.6a2346p-19 + want 0x1.6a2344p-19. */ +svfloat32_t SV_NAME_F2 (hypot) (svfloat32_t x, svfloat32_t y, + const svbool_t pg) +{ + svfloat32_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y); + + svbool_t special = svcmpge ( + pg, svsub_x (pg, svreinterpret_u32 (sqsum), TinyBound), Thres); + + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case (sqsum, x, y, pg, special); + + return svsqrt_x (pg, sqsum); +} diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c index 1e5ef99e89..c065aaebae 100644 --- a/sysdeps/aarch64/fpu/log10_advsimd.c +++ b/sysdeps/aarch64/fpu/log10_advsimd.c @@ -58,8 +58,10 @@ static inline struct entry lookup (uint64x2_t i) { struct entry e; - uint64_t i0 = (i[0] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; - uint64_t i1 = (i[1] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + uint64_t i0 + = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + uint64_t i1 + = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc); float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc); e.invc = vuzp1q_f64 (e0, e1); diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c index a34978f6cf..4057c552d8 100644 --- a/sysdeps/aarch64/fpu/log2_advsimd.c +++ 
b/sysdeps/aarch64/fpu/log2_advsimd.c @@ -55,8 +55,10 @@ static inline struct entry lookup (uint64x2_t i) { struct entry e; - uint64_t i0 = (i[0] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; - uint64_t i1 = (i[1] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + uint64_t i0 + = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + uint64_t i1 + = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc); float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc); e.invc = vuzp1q_f64 (e0, e1); diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c index 21df61728c..015a6da7d7 100644 --- a/sysdeps/aarch64/fpu/log_advsimd.c +++ b/sysdeps/aarch64/fpu/log_advsimd.c @@ -54,17 +54,12 @@ lookup (uint64x2_t i) { /* Since N is a power of 2, n % N = n & (N - 1). */ struct entry e; - uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; - uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); -#if __BYTE_ORDER == __LITTLE_ENDIAN e.invc = vuzp1q_f64 (e0, e1); e.logc = vuzp2q_f64 (e0, e1); -#else - e.invc = vuzp1q_f64 (e1, e0); - e.logc = vuzp2q_f64 (e1, e0); -#endif return e; } diff --git a/sysdeps/aarch64/fpu/pow_advsimd.c b/sysdeps/aarch64/fpu/pow_advsimd.c new file mode 100644 index 0000000000..3c91e3e183 --- /dev/null +++ b/sysdeps/aarch64/fpu/pow_advsimd.c @@ -0,0 +1,249 @@ +/* Double-precision vector (AdvSIMD) pow function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +/* Defines parameters of the approximation and scalar fallback. */ +#include "finite_pow.h" + +#define VecSmallExp v_u64 (SmallExp) +#define VecThresExp v_u64 (ThresExp) + +#define VecSmallPowX v_u64 (SmallPowX) +#define VecThresPowX v_u64 (ThresPowX) +#define VecSmallPowY v_u64 (SmallPowY) +#define VecThresPowY v_u64 (ThresPowY) + +static const struct data +{ + float64x2_t log_poly[6]; + float64x2_t exp_poly[3]; + float64x2_t ln2_hi, ln2_lo; + float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n, small_powx; + uint64x2_t inf; +} data = { + /* Coefficients copied from v_pow_log_data.c + relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] + Coefficients are scaled to match the scaling during evaluation. */ + .log_poly + = { V2 (0x1.555555555556p-2 * -2), V2 (-0x1.0000000000006p-2 * -2), + V2 (0x1.999999959554ep-3 * 4), V2 (-0x1.555555529a47ap-3 * 4), + V2 (0x1.2495b9b4845e9p-3 * -8), V2 (-0x1.0002b8b263fc3p-3 * -8) }, + .ln2_hi = V2 (0x1.62e42fefa3800p-1), + .ln2_lo = V2 (0x1.ef35793c76730p-45), + /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549 + (0.550 without fma) if |x| < ln2/512. */ + .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3), + V2 (0x1.5555576a5adcep-5) }, + .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. 
*/ + .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */ + .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */ + .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45), + .small_powx = V2 (0x1p-126), + .inf = V2 (0x7ff0000000000000) +}; + +#define A(i) data.log_poly[i] +#define C(i) data.exp_poly[i] + +/* This version implements an algorithm close to scalar pow but + - does not implement the trick in the exp's specialcase subroutine to avoid + double-rounding, + - does not use a tail in the exponential core computation, + - and pow's exp polynomial order and table bits might differ. + + Maximum measured error is 1.04 ULPs: + _ZGVnN2vv_pow(0x1.024a3e56b3c3p-136, 0x1.87910248b58acp-13) + got 0x1.f71162f473251p-1 + want 0x1.f71162f473252p-1. */ + +static inline float64x2_t +v_masked_lookup_f64 (const double *table, uint64x2_t i) +{ + return (float64x2_t){ + table[(i[0] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)], + table[(i[1] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)] + }; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline float64x2_t +v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) +{ + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off)); + int64x2_t k + = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52))); + float64x2_t z = vreinterpretq_f64_u64 (iz); + float64x2_t kd = vcvtq_f64_s64 (k); + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). 
*/ + float64x2_t invc = v_masked_lookup_f64 (__v_pow_log_data.invc, tmp); + float64x2_t logc = v_masked_lookup_f64 (__v_pow_log_data.logc, tmp); + float64x2_t logctail = v_masked_lookup_f64 (__v_pow_log_data.logctail, tmp); + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc); + /* k*Ln2 + log(c) + r. */ + float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi); + float64x2_t t2 = vaddq_f64 (t1, r); + float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo); + float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r); + /* Evaluation is optimized assuming superscalar pipelined execution. */ + float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r); + float64x2_t ar2 = vmulq_f64 (r, ar); + float64x2_t ar3 = vmulq_f64 (r, ar2); + /* k*Ln2 + log(c) + r + A[0]*r*r. */ + float64x2_t hi = vaddq_f64 (t2, ar2); + float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r); + float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2); + /* p = log1p(r) - r - A[0]*r*r. */ + float64x2_t a56 = vfmaq_f64 (A (4), r, A (5)); + float64x2_t a34 = vfmaq_f64 (A (2), r, A (3)); + float64x2_t a12 = vfmaq_f64 (A (0), r, A (1)); + float64x2_t p = vfmaq_f64 (a34, ar2, a56); + p = vfmaq_f64 (a12, ar2, p); + p = vmulq_f64 (ar3, p); + float64x2_t lo + = vaddq_f64 (vaddq_f64 (vaddq_f64 (vaddq_f64 (lo1, lo2), lo3), lo4), p); + float64x2_t y = vaddq_f64 (hi, lo); + *tail = vaddq_f64 (vsubq_f64 (hi, y), lo); + return y; +} + +static float64x2_t VPCS_ATTR NOINLINE +exp_special_case (float64x2_t x, float64x2_t xtail) +{ + return (float64x2_t){ exp_nosignbias (x[0], xtail[0]), + exp_nosignbias (x[1], xtail[1]) }; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */ +static inline float64x2_t +v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) +{ + /* Fallback to scalar exp_inline for all lanes if any lane + contains value of x s.t. |x| <= 2^-54 or >= 512. 
*/ + uint64x2_t abstop + = vshrq_n_u64 (vandq_u64 (vreinterpretq_u64_f64 (x), d->inf), 52); + uint64x2_t uoflowx + = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp); + if (__glibc_unlikely (v_any_u64 (uoflowx))) + return exp_special_case (x, xtail); + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ + float64x2_t z = vmulq_f64 (d->inv_ln2_n, x); + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + float64x2_t kd = vaddq_f64 (z, d->shift); + uint64x2_t ki = vreinterpretq_u64_f64 (kd); + kd = vsubq_f64 (kd, d->shift); + float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n); + r = vfmsq_f64 (r, kd, d->ln2_lo_n); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = vaddq_f64 (r, xtail); + /* 2^(k/N) ~= scale. */ + uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1)); + uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64x2_t sbits = v_lookup_u64 (SBits, idx); + sbits = vaddq_u64 (sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t tmp = vfmaq_f64 (C (1), r, C (2)); + tmp = vfmaq_f64 (C (0), r, tmp); + tmp = vfmaq_f64 (r, r2, tmp); + float64x2_t scale = vreinterpretq_f64_u64 (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return vfmaq_f64 (scale, scale, tmp); +} + +static float64x2_t NOINLINE VPCS_ATTR +scalar_fallback (float64x2_t x, float64x2_t y) +{ + return (float64x2_t){ pow_scalar_special_case (x[0], y[0]), + pow_scalar_special_case (x[1], y[1]) }; +} + +float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + /* Case of x <= 0 is too complicated to be vectorised efficiently here, + fallback to scalar pow for all lanes if any x < 0 detected. 
*/ + if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x)))) + return scalar_fallback (x, y); + + uint64x2_t vix = vreinterpretq_u64_f64 (x); + uint64x2_t viy = vreinterpretq_u64_f64 (y); + uint64x2_t iay = vandq_u64 (viy, d->inf); + + /* Special cases of x or y. */ +#if WANT_SIMD_EXCEPT + /* Small or large. */ + uint64x2_t vtopx = vshrq_n_u64 (vix, 52); + uint64x2_t vabstopy = vshrq_n_u64 (iay, 52); + uint64x2_t specialx + = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX); + uint64x2_t specialy + = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY); +#else + /* The case y==0 does not trigger a special case, since in this case it is + necessary to fix the result only if x is a signalling nan, which already + triggers a special case. We test y==0 directly in the scalar fallback. */ + uint64x2_t iax = vandq_u64 (vix, d->inf); + uint64x2_t specialx = vcgeq_u64 (iax, d->inf); + uint64x2_t specialy = vcgeq_u64 (iay, d->inf); +#endif + uint64x2_t special = vorrq_u64 (specialx, specialy); + /* Fallback to scalar on all lanes if any lane is inf or nan. */ + if (__glibc_unlikely (v_any_u64 (special))) + return scalar_fallback (x, y); + + /* Small cases of x: |x| < 0x1p-126. */ + uint64x2_t smallx = vcaltq_f64 (x, d->small_powx); + if (__glibc_unlikely (v_any_u64 (smallx))) + { + /* Update ix if top 12 bits of x are 0. */ + uint64x2_t sub_x = vceqzq_u64 (vshrq_n_u64 (vix, 52)); + if (__glibc_unlikely (v_any_u64 (sub_x))) + { + /* Normalize subnormal x so exponent becomes negative. */ + uint64x2_t vix_norm = vreinterpretq_u64_f64 ( + vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (v_u64 (1ULL << 52))))); + vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52)); + vix = vbslq_u64 (sub_x, vix_norm, vix); + } + } + + /* Vector Log(ix, &lo). */ + float64x2_t vlo; + float64x2_t vhi = v_log_inline (vix, &vlo, d); + + /* Vector Exp(y_loghi, y_loglo). 
*/ + float64x2_t vehi = vmulq_f64 (y, vhi); + float64x2_t velo = vmulq_f64 (y, vlo); + float64x2_t vemi = vfmsq_f64 (vehi, y, vhi); + velo = vsubq_f64 (velo, vemi); + return v_exp_inline (vehi, velo, d); +} diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c new file mode 100644 index 0000000000..4c0bf8956c --- /dev/null +++ b/sysdeps/aarch64/fpu/pow_sve.c @@ -0,0 +1,411 @@ +/* Double-precision vector (SVE) pow function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* This version share a similar algorithm as AOR scalar pow. + + The core computation consists in computing pow(x, y) as + + exp (y * log (x)). + + The algorithms for exp and log are very similar to scalar exp and log. + The log relies on table lookup for 3 variables and an order 8 polynomial. + It returns a high and a low contribution that are then passed to the exp, + to minimise the loss of accuracy in both routines. + The exp is based on 8-bit table lookup for scale and order-4 polynomial. + The SVE algorithm drops the tail in the exp computation at the price of + a lower accuracy, slightly above 1ULP. 
+ The SVE algorithm also drops the special treatement of small (< 2^-65) and + large (> 2^63) finite values of |y|, as they only affect non-round to nearest + modes. + + Maximum measured error is 1.04 ULPs: + SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12) + got 0x1.f7116284221fcp-1 + want 0x1.f7116284221fdp-1. */ + +#include "math_config.h" +#include "sv_math.h" + +/* Data is defined in v_pow_log_data.c. */ +#define N_LOG (1 << V_POW_LOG_TABLE_BITS) +#define A __v_pow_log_data.poly +#define Off 0x3fe6955500000000 + +/* Data is defined in v_pow_exp_data.c. */ +#define N_EXP (1 << V_POW_EXP_TABLE_BITS) +#define SignBias (0x800 << V_POW_EXP_TABLE_BITS) +#define C __v_pow_exp_data.poly +#define SmallExp 0x3c9 /* top12(0x1p-54). */ +#define BigExp 0x408 /* top12(512.). */ +#define ThresExp 0x03f /* BigExp - SmallExp. */ +#define HugeExp 0x409 /* top12(1024.). */ + +/* Constants associated with pow. */ +#define SmallPowX 0x001 /* top12(0x1p-126). */ +#define BigPowX 0x7ff /* top12(INFINITY). */ +#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ +#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */ +#define BigPowY 0x43e /* top12(0x1.749p62). */ +#define ThresPowY 0x080 /* BigPowY - SmallPowY. */ + +/* Check if x is an integer. */ +static inline svbool_t +sv_isint (svbool_t pg, svfloat64_t x) +{ + return svcmpeq (pg, svrintz_z (pg, x), x); +} + +/* Check if x is real not integer valued. */ +static inline svbool_t +sv_isnotint (svbool_t pg, svfloat64_t x) +{ + return svcmpne (pg, svrintz_z (pg, x), x); +} + +/* Check if x is an odd integer. */ +static inline svbool_t +sv_isodd (svbool_t pg, svfloat64_t x) +{ + svfloat64_t y = svmul_x (pg, x, 0.5); + return sv_isnotint (pg, y); +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. 
*/ +static inline int +checkint (uint64_t iy) +{ + int e = iy >> 52 & 0x7ff; + if (e < 0x3ff) + return 0; + if (e > 0x3ff + 52) + return 2; + if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) + return 0; + if (iy & (1ULL << (0x3ff + 52 - e))) + return 1; + return 2; +} + +/* Top 12 bits (sign and exponent of each double float lane). */ +static inline svuint64_t +sv_top12 (svfloat64_t x) +{ + return svlsr_x (svptrue_b64 (), svreinterpret_u64 (x), 52); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline int +zeroinfnan (uint64_t i) +{ + return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1; +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline svbool_t +sv_zeroinfnan (svbool_t pg, svuint64_t i) +{ + return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1), + 2 * asuint64 (INFINITY) - 1); +} + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double +specialcase (double tmp, uint64_t sbits, uint64_t ki) +{ + double scale; + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + return 0x1p1009 * (scale + scale * tmp); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + /* Note: sbits is signed scale. */ + scale = asdouble (sbits); + double y = scale + scale * tmp; + return 0x1p-1022 * y; +} + +/* Scalar fallback for special cases of SVE pow's exp. 
*/ +static inline svfloat64_t +sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2, + svfloat64_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + double sx1 = svclastb (p, 0, x1); + uint64_t su1 = svclastb (p, 0, u1); + uint64_t su2 = svclastb (p, 0, u2); + double elem = specialcase (sx1, su1, su2); + svfloat64_t y2 = sv_f64 (elem); + y = svsel (p, y2, y); + p = svpnext_b64 (cmp, p); + } + return y; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline svfloat64_t +sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) +{ + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS), + sv_u64 (N_LOG - 1)); + svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52))); + svfloat64_t z = svreinterpret_f64 (iz); + svfloat64_t kd = svcvt_f64_x (pg, k); + + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ + /* SVE lookup requires 3 separate lookup tables, as opposed to scalar version + that uses array of structures. We also do the lookup earlier in the code to + make sure it finishes as early as possible. */ + svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i); + svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i); + svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i); + + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. 
*/ + svfloat64_t r = svmad_x (pg, z, invc, -1.0); + /* k*Ln2 + log(c) + r. */ + svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi); + svfloat64_t t2 = svadd_x (pg, t1, r); + svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo); + svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r); + + /* Evaluation is optimized assuming superscalar pipelined execution. */ + svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */ + svfloat64_t ar2 = svmul_x (pg, r, ar); + svfloat64_t ar3 = svmul_x (pg, r, ar2); + /* k*Ln2 + log(c) + r + A[0]*r*r. */ + svfloat64_t hi = svadd_x (pg, t2, ar2); + svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r); + svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2); + /* p = log1p(r) - r - A[0]*r*r. */ + /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r * + A[6])))). */ + svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]); + svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]); + svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]); + svfloat64_t p = svmla_x (pg, a34, ar2, a56); + p = svmla_x (pg, a12, ar2, p); + p = svmul_x (pg, ar3, p); + svfloat64_t lo = svadd_x ( + pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); + svfloat64_t y = svadd_x (pg, hi, lo); + *tail = svadd_x (pg, svsub_x (pg, hi, y), lo); + return y; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ +static inline svfloat64_t +sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, + svuint64_t sign_bias) +{ + /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow) + and other cases of large values of x (scale * (1 + TMP) oflow). */ + svuint64_t abstop = svand_x (pg, sv_top12 (x), 0x7ff); + /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). 
*/ + svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp); + + /* Conditions special, uflow and oflow are all expressed as uoflow && + something, hence do not bother computing anything if no lane in uoflow is + true. */ + svbool_t special = svpfalse_b (); + svbool_t uflow = svpfalse_b (); + svbool_t oflow = svpfalse_b (); + if (__glibc_unlikely (svptest_any (pg, uoflow))) + { + /* |x| is tiny (|x| <= 0x1p-54). */ + uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); + uflow = svand_z (pg, uoflow, uflow); + /* |x| is huge (|x| >= 1024). */ + oflow = svcmpge (pg, abstop, HugeExp); + oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow)); + /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow + or underflow. */ + special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2); + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift); + svfloat64_t kd = svadd_x (pg, z, shift); + svuint64_t ki = svreinterpret_u64 (kd); + kd = svsub_x (pg, kd, shift); + svfloat64_t r = x; + r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi); + r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = svadd_x (pg, r, xtail); + /* 2^(k/N) ~= scale. */ + svuint64_t idx = svand_x (pg, ki, N_EXP - 1); + svuint64_t top + = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); + sbits = svadd_x (pg, sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). 
*/ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]); + tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp); + tmp = svmla_x (pg, r, r2, tmp); + svfloat64_t scale = svreinterpret_f64 (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + z = svmla_x (pg, scale, scale, tmp); + + /* Update result with special and large cases. */ + if (__glibc_unlikely (svptest_any (pg, special))) + z = sv_call_specialcase (tmp, sbits, ki, z, special); + + /* Handle underflow and overflow. */ + svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63); + svbool_t x_is_neg = svcmpne (pg, sign_bit, 0); + svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); + svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); + res_uoflow = svreinterpret_f64 ( + svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); + z = svsel (oflow, res_uoflow, z); + /* Avoid spurious underflow for tiny x. */ + svfloat64_t res_spurious_uflow + = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); + z = svsel (uflow, res_spurious_uflow, z); + + return z; +} + +static inline double +pow_sc (double x, double y) +{ + uint64_t ix = asuint64 (x); + uint64_t iy = asuint64 (y); + /* Special cases: |x| or |y| is 0, inf or nan. */ + if (__glibc_unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignaling_inline (x) ? x + y : 1.0; + if (ix == asuint64 (1.0)) + return issignaling_inline (y) ? x + y : 1.0; + if (2 * ix > 2 * asuint64 (INFINITY) || 2 * iy > 2 * asuint64 (INFINITY)) + return x + y; + if (2 * ix == 2 * asuint64 (1.0)) + return 1.0; + if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63)) + return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (__glibc_unlikely (zeroinfnan (ix))) + { + double_t x2 = x * x; + if (ix >> 63 && checkint (iy) == 1) + x2 = -x2; + return (iy >> 63) ? 
1 / x2 : x2; + } + return x; +} + +svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) +{ + /* This preamble handles special case conditions used in the final scalar + fallbacks. It also updates ix and sign_bias, that are used in the core + computation too, i.e., exp( y * log (x) ). */ + svuint64_t vix0 = svreinterpret_u64 (x); + svuint64_t viy0 = svreinterpret_u64 (y); + svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52); + + /* Negative x cases. */ + svuint64_t sign_bit = svlsr_m (pg, vix0, 63); + svbool_t xisneg = svcmpeq (pg, sign_bit, 1); + + /* Set sign_bias and ix depending on sign of x and nature of y. */ + svbool_t yisnotint_xisneg = svpfalse_b (); + svuint64_t sign_bias = sv_u64 (0); + svuint64_t vix = vix0; + svuint64_t vtopx1 = vtopx0; + if (__glibc_unlikely (svptest_any (pg, xisneg))) + { + /* Determine nature of y. */ + yisnotint_xisneg = sv_isnotint (xisneg, y); + svbool_t yisint_xisneg = sv_isint (xisneg, y); + svbool_t yisodd_xisneg = sv_isodd (xisneg, y); + /* ix set to abs(ix) if y is integer. */ + vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff); + vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff); + /* Set to SignBias if x is negative and y is odd. */ + sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0)); + } + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (pg, vix0); + svbool_t yspecial = sv_zeroinfnan (pg, viy0); + svbool_t special = svorr_z (pg, xspecial, yspecial); + + /* Small cases of x: |x| < 0x1p-126. */ + svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff); + svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX); + if (__glibc_unlikely (svptest_any (pg, xsmall))) + { + /* Normalize subnormal x so exponent becomes negative. 
*/ + svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0); + + svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52)); + vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff); + vix_norm = svsub_m (xsmall, vix_norm, 52ULL << 52); + vix = svsel (topx_is_null, vix_norm, vix); + } + + /* y_hi = log(ix, &y_lo). */ + svfloat64_t vlo; + svfloat64_t vhi = sv_log_inline (pg, vix, &vlo); + + /* z = exp(y_hi, y_lo, sign_bias). */ + svfloat64_t vehi = svmul_x (pg, y, vhi); + svfloat64_t velo = svmul_x (pg, y, vlo); + svfloat64_t vemi = svmls_x (pg, vehi, y, vhi); + velo = svsub_x (pg, velo, vemi); + svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias); + + /* Cases of finite y and finite negative x. */ + vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz); + + /* Cases of zero/inf/nan x or y. */ + if (__glibc_unlikely (svptest_any (pg, special))) + vz = sv_call2_f64 (pow_sc, x, y, vz, special); + + return vz; +} diff --git a/sysdeps/aarch64/fpu/powf_advsimd.c b/sysdeps/aarch64/fpu/powf_advsimd.c new file mode 100644 index 0000000000..8232e70436 --- /dev/null +++ b/sysdeps/aarch64/fpu/powf_advsimd.c @@ -0,0 +1,210 @@ +/* Single-precision vector (AdvSIMD) pow function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "math_config.h" +#include "v_math.h" + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Thresh v_u32 (0x7f000000) /* Max - Min. */ +#define MantissaMask v_u32 (0x007fffff) + +#define A d->log2_poly +#define C d->exp2f_poly + +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */ +#define Off v_u32 (0x3f35d000) + +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_EXP2F_TABLE_BITS 5 +#define Log2IdxMask ((1 << V_POWF_LOG2_TABLE_BITS) - 1) +#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) + +static const struct data +{ + struct + { + double invc, logc; + } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; + float64x2_t log2_poly[4]; + uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; + float64x2_t exp2f_poly[3]; +} data = { + .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, + {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, + {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, + {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, + {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, + {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, + {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, + {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, + {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, + {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, + {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, + {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, + {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, + {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, + {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, + {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, + {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, + {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, + {0x1p+0, 0x0p+0 * Scale}, + {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, + {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, + {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, + 
{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, + {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, + {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, + {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, + {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, + {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, + {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, + {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, + {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, + {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, + .log2_poly = { /* rel err: 1.5 * 2^-30. */ + V2 (-0x1.6ff5daa3b3d7cp-2 * Scale), + V2 (0x1.ec81d03c01aebp-2 * Scale), + V2 (-0x1.71547bb43f101p-1 * Scale), + V2 (0x1.7154764a815cbp0 * Scale)}, + .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, + .exp2f_poly = { /* rel err: 1.69 * 2^-34. */ + V2 (0x1.c6af84b912394p-5 / Scale / Scale / Scale), + V2 (0x1.ebfce50fac4f3p-3 / Scale / Scale), + V2 (0x1.62e42ff0c52d6p-1 / Scale)}}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +static inline float64x2_t +ylogx_core (const struct data *d, float64x2_t iz, float64x2_t k, + float64x2_t invc, float64x2_t logc, float64x2_t y) +{ + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. 
*/ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), iz, invc); + float64x2_t y0 = vaddq_f64 (logc, k); + + /* Polynomial to approximate log1p(r)/ln2. */ + float64x2_t logx = vfmaq_f64 (A[1], r, A[0]); + logx = vfmaq_f64 (A[2], logx, r); + logx = vfmaq_f64 (A[3], logx, r); + logx = vfmaq_f64 (y0, logx, r); + + return vmulq_f64 (logx, y); +} + +static inline float64x2_t +log2_lookup (const struct data *d, uint32_t i) +{ + return vld1q_f64 ( + &d->log2_tab[(i >> (23 - V_POWF_LOG2_TABLE_BITS)) & Log2IdxMask].invc); +} + +static inline uint64x1_t +exp2f_lookup (const struct data *d, uint64_t i) +{ + return vld1_u64 (&d->exp2f_tab[i % (1 << V_EXP2F_TABLE_BITS)]); +} + +static inline float32x2_t +powf_core (const struct data *d, float64x2_t ylogx) +{ + /* N*x = k + r with r in [-1/2, 1/2]. */ + float64x2_t kd = vrndnq_f64 (ylogx); + int64x2_t ki = vcvtaq_s64_f64 (ylogx); + float64x2_t r = vsubq_f64 (ylogx, kd); + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + uint64x2_t t = vcombine_u64 (exp2f_lookup (d, vgetq_lane_s64 (ki, 0)), + exp2f_lookup (d, vgetq_lane_s64 (ki, 1))); + t = vaddq_u64 ( + t, vreinterpretq_u64_s64 (vshlq_n_s64 (ki, 52 - V_EXP2F_TABLE_BITS))); + float64x2_t s = vreinterpretq_f64_u64 (t); + float64x2_t p = vfmaq_f64 (C[1], r, C[0]); + p = vfmaq_f64 (C[2], r, p); + p = vfmaq_f64 (s, p, vmulq_f64 (s, r)); + return vcvt_f32_f64 (p); +} + +float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); + uint32x4_t tmp = vsubq_u32 (u, Off); + uint32x4_t top = vbicq_u32 (tmp, MantissaMask); + float32x4_t iz = vreinterpretq_f32_u32 (vsubq_u32 (u, top)); + int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), + 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ + + /* Use double precision for each lane: split input vectors into lo and hi + halves and promote. 
*/ + float64x2_t tab0 = log2_lookup (d, vgetq_lane_u32 (tmp, 0)), + tab1 = log2_lookup (d, vgetq_lane_u32 (tmp, 1)), + tab2 = log2_lookup (d, vgetq_lane_u32 (tmp, 2)), + tab3 = log2_lookup (d, vgetq_lane_u32 (tmp, 3)); + + float64x2_t iz_lo = vcvt_f64_f32 (vget_low_f32 (iz)), + iz_hi = vcvt_high_f64_f32 (iz); + + float64x2_t k_lo = vcvtq_f64_s64 (vmovl_s32 (vget_low_s32 (k))), + k_hi = vcvtq_f64_s64 (vmovl_high_s32 (k)); + + float64x2_t invc_lo = vzip1q_f64 (tab0, tab1), + invc_hi = vzip1q_f64 (tab2, tab3), + logc_lo = vzip2q_f64 (tab0, tab1), + logc_hi = vzip2q_f64 (tab2, tab3); + + float64x2_t y_lo = vcvt_f64_f32 (vget_low_f32 (y)), + y_hi = vcvt_high_f64_f32 (y); + + float64x2_t ylogx_lo = ylogx_core (d, iz_lo, k_lo, invc_lo, logc_lo, y_lo); + float64x2_t ylogx_hi = ylogx_core (d, iz_hi, k_hi, invc_hi, logc_hi, y_hi); + + uint32x4_t ylogx_top = vuzp2q_u32 (vreinterpretq_u32_f64 (ylogx_lo), + vreinterpretq_u32_f64 (ylogx_hi)); + + cmp = vorrq_u32 ( + cmp, vcgeq_u32 (vandq_u32 (vshrq_n_u32 (ylogx_top, 15), v_u32 (0xffff)), + vdupq_n_u32 (asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) + >> 47))); + + float32x2_t p_lo = powf_core (d, ylogx_lo); + float32x2_t p_hi = powf_core (d, ylogx_hi); + + if (__glibc_unlikely (v_any_u32 (cmp))) + return special_case (x, y, vcombine_f32 (p_lo, p_hi), cmp); + return vcombine_f32 (p_lo, p_hi); +} +libmvec_hidden_def (V_NAME_F2 (pow)) +HALF_WIDTH_ALIAS_F2(pow) diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c new file mode 100644 index 0000000000..4f6a142325 --- /dev/null +++ b/sysdeps/aarch64/fpu/powf_sve.c @@ -0,0 +1,335 @@ +/* Single-precision vector (SVE) pow function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "../ieee754/flt-32/math_config.h" +#include "sv_math.h" + +/* The following data is used in the SVE pow core computation + and special case detection. */ +#define Tinvc __v_powf_data.invc +#define Tlogc __v_powf_data.logc +#define Texp __v_powf_data.scale +#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11)) +#define Shift 0x1.8p52 +#define Norm 0x1p23f /* 0x4b000000. */ + +/* Overall ULP error bound for pow is 2.6 ulp + ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */ +static const struct data +{ + double log_poly[4]; + double exp_poly[3]; + float uflow_bound, oflow_bound, small_bound; + uint32_t sign_bias, sign_mask, subnormal_bias, off; +} data = { + /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of + V_POWF_EXP2_N. */ + .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3, + -0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 }, + /* rel err: 1.69 * 2^-34. */ + .exp_poly = { + 0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3. */ + 0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2. */ + 0x1.62e42ff0c52d6p-6, /* A3 / V_POWF_EXP2_N. */ + }, + .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N. */ + .oflow_bound = 0x1p+12f, /* 128.0 * V_POWF_EXP2_N. 
*/ + .small_bound = 0x1p-126f, + .off = 0x3f35d000, + .sign_bias = SignBias, + .sign_mask = 0x80000000, + .subnormal_bias = 0x0b800000, /* 23 << 23. */ +}; + +#define A(i) sv_f64 (d->log_poly[i]) +#define C(i) sv_f64 (d->exp_poly[i]) + +/* Check if x is an integer. */ +static inline svbool_t +svisint (svbool_t pg, svfloat32_t x) +{ + return svcmpeq (pg, svrintz_z (pg, x), x); +} + +/* Check if x is real not integer valued. */ +static inline svbool_t +svisnotint (svbool_t pg, svfloat32_t x) +{ + return svcmpne (pg, svrintz_z (pg, x), x); +} + +/* Check if x is an odd integer. */ +static inline svbool_t +svisodd (svbool_t pg, svfloat32_t x) +{ + svfloat32_t y = svmul_x (pg, x, 0.5f); + return svisnotint (pg, y); +} + +/* Check if zero, inf or nan. */ +static inline svbool_t +sv_zeroinfnan (svbool_t pg, svuint32_t i) +{ + return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1), + 2u * 0x7f800000 - 1); +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. */ +static inline int +checkint (uint32_t iy) +{ + int e = iy >> 23 & 0xff; + if (e < 0x7f) + return 0; + if (e > 0x7f + 23) + return 2; + if (iy & ((1 << (0x7f + 23 - e)) - 1)) + return 0; + if (iy & (1 << (0x7f + 23 - e))) + return 1; + return 2; +} + +/* Check if zero, inf or nan. */ +static inline int +zeroinfnan (uint32_t ix) +{ + return 2 * ix - 1 >= 2u * 0x7f800000 - 1; +} + +/* A scalar subroutine used to fix main power special cases. Similar to the + preamble of scalar powf except that we do not update ix and sign_bias. This + is done in the preamble of the SVE powf. */ +static inline float +powf_specialcase (float x, float y, float z) +{ + uint32_t ix = asuint (x); + uint32_t iy = asuint (y); + /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan). */ + if (__glibc_unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignalingf_inline (x) ? 
x + y : 1.0f; + if (ix == 0x3f800000) + return issignalingf_inline (y) ? x + y : 1.0f; + if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000) + return x + y; + if (2 * ix == 2 * 0x3f800000) + return 1.0f; + if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000)) + return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (__glibc_unlikely (zeroinfnan (ix))) + { + float_t x2 = x * x; + if (ix & 0x80000000 && checkint (iy) == 1) + x2 = -x2; + return iy & 0x80000000 ? 1 / x2 : x2; + } + /* We need a return here in case x<0 and y is integer, but all other tests + need to be run. */ + return z; +} + +/* Scalar fallback for special case routines with custom signature. */ +static inline svfloat32_t +sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + float sx1 = svclastb (p, 0, x1); + float sx2 = svclastb (p, 0, x2); + float elem = svclastb (p, 0, y); + elem = powf_specialcase (sx1, sx2, elem); + svfloat32_t y2 = sv_f32 (elem); + y = svsel (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + +/* Compute core for half of the lanes in double precision. */ +static inline svfloat64_t +sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k, + svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx, + const struct data *d) +{ + svfloat64_t invc = svld1_gather_index (pg, Tinvc, i); + svfloat64_t logc = svld1_gather_index (pg, Tlogc, i); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc); + svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k)); + + /* Polynomial to approximate log1p(r)/ln2. 
*/ + svfloat64_t logx = A (0); + logx = svmla_x (pg, A (1), r, logx); + logx = svmla_x (pg, A (2), r, logx); + logx = svmla_x (pg, A (3), r, logx); + logx = svmla_x (pg, y0, r, logx); + *pylogx = svmul_x (pg, y, logx); + + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t kd = svadd_x (pg, *pylogx, Shift); + svuint64_t ki = svreinterpret_u64 (kd); + kd = svsub_x (pg, kd, Shift); + + r = svsub_x (pg, *pylogx, kd); + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + svuint64_t t + = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1)); + svuint64_t ski = svadd_x (pg, ki, sign_bias); + t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS)); + svfloat64_t s = svreinterpret_f64 (t); + + svfloat64_t p = C (0); + p = svmla_x (pg, C (1), p, r); + p = svmla_x (pg, C (2), p, r); + p = svmla_x (pg, s, p, svmul_x (pg, s, r)); + + return p; +} + +/* Widen vector to double precision and compute core on both halves of the + vector. Lower cost of promotion by considering all lanes active. */ +static inline svfloat32_t +sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, + svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx, + const struct data *d) +{ + const svbool_t ptrue = svptrue_b64 (); + + /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in + order to perform core computation in double precision. 
*/ + const svbool_t pg_lo = svunpklo (pg); + const svbool_t pg_hi = svunpkhi (pg); + svfloat64_t y_lo = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); + svfloat64_t y_hi = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); + svfloat32_t z = svreinterpret_f32 (iz); + svfloat64_t z_lo = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z)))); + svfloat64_t z_hi = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z)))); + svuint64_t i_lo = svunpklo (i); + svuint64_t i_hi = svunpkhi (i); + svint64_t k_lo = svunpklo (k); + svint64_t k_hi = svunpkhi (k); + svuint64_t sign_bias_lo = svunpklo (sign_bias); + svuint64_t sign_bias_hi = svunpkhi (sign_bias); + + /* Compute each part in double precision. */ + svfloat64_t ylogx_lo, ylogx_hi; + svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo, + sign_bias_lo, &ylogx_lo, d); + svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi, + sign_bias_hi, &ylogx_hi, d); + + /* Convert back to single-precision and interleave. */ + svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo); + svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi); + *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32); + svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo); + svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi); + return svuzp1 (lo_32, hi_32); +} + +/* Implementation of SVE powf. + Provides the same accuracy as AdvSIMD powf, since it relies on the same + algorithm. The theoretical maximum error is under 2.60 ULPs. + Maximum measured error is 2.56 ULPs: + SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127 + want 0x1.fd4b06p+127. */ +svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t vix0 = svreinterpret_u32 (x); + svuint32_t viy0 = svreinterpret_u32 (y); + + /* Negative x cases. 
*/ + svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask); + svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask); + + /* Set sign_bias and ix depending on sign of x and nature of y. */ + svbool_t yisnotint_xisneg = svpfalse_b (); + svuint32_t sign_bias = sv_u32 (0); + svuint32_t vix = vix0; + if (__glibc_unlikely (svptest_any (pg, xisneg))) + { + /* Determine nature of y. */ + yisnotint_xisneg = svisnotint (xisneg, y); + svbool_t yisint_xisneg = svisint (xisneg, y); + svbool_t yisodd_xisneg = svisodd (xisneg, y); + /* ix set to abs(ix) if y is integer. */ + vix = svand_m (yisint_xisneg, vix0, 0x7fffffff); + /* Set to SignBias if x is negative and y is odd. */ + sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0)); + } + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (pg, vix0); + svbool_t yspecial = sv_zeroinfnan (pg, viy0); + svbool_t cmp = svorr_z (pg, xspecial, yspecial); + + /* Small cases of x: |x| < 0x1p-126. */ + svbool_t xsmall = svaclt (pg, x, d->small_bound); + if (__glibc_unlikely (svptest_any (pg, xsmall))) + { + /* Normalize subnormal x so exponent becomes negative. */ + svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm)); + vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff); + vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias); + vix = svsel (xsmall, vix_norm, vix); + } + /* Part of core computation carried in working precision. */ + svuint32_t tmp = svsub_x (pg, vix, d->off); + svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), + V_POWF_LOG2_N - 1); + svuint32_t top = svand_x (pg, tmp, 0xff800000); + svuint32_t iz = svsub_x (pg, vix, top); + svint32_t k + = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS)); + + /* Compute core in extended precision and return intermediate ylogx results to + handle cases of underflow and underflow in exp. 
*/ + svfloat32_t ylogx; + svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d); + + /* Handle exp special cases of underflow and overflow. */ + svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); + svfloat32_t ret_oflow + = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY))); + svfloat32_t ret_uflow = svreinterpret_f32 (sign); + ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret); + ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret); + + /* Cases of finite y and finite negative x. */ + ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret); + + if (__glibc_unlikely (svptest_any (pg, cmp))) + return sv_call_powf_sc (x, y, ret, cmp); + + return ret; +} diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c index fa3723b10c..3e3b76c502 100644 --- a/sysdeps/aarch64/fpu/sinh_advsimd.c +++ b/sysdeps/aarch64/fpu/sinh_advsimd.c @@ -22,8 +22,9 @@ static const struct data { - float64x2_t poly[11]; - float64x2_t inv_ln2, m_ln2, shift; + float64x2_t poly[11], inv_ln2; + double m_ln2[2]; + float64x2_t shift; uint64x2_t halff; int64x2_t onef; #if WANT_SIMD_EXCEPT @@ -40,7 +41,7 @@ static const struct data V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, .inv_ln2 = V2 (0x1.71547652b82fep0), - .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56}, + .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56}, .shift = V2 (0x1.8p52), .halff = V2 (0x3fe0000000000000), @@ -67,8 +68,10 @@ expm1_inline (float64x2_t x) and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); int64x2_t i = vcvtq_s64_f64 (j); - float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0); - f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1); + + float64x2_t m_ln2 = vld1q_f64 (d->m_ln2); + float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0); + f = vfmaq_laneq_f64 (f, j, m_ln2, 1); /* Approximate expm1(f) using polynomial. 
*/ float64x2_t f2 = vmulq_f64 (f, f); float64x2_t f4 = vmulq_f64 (f2, f2); diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c index 0459821ab2..d56a102dd1 100644 --- a/sysdeps/aarch64/fpu/tan_advsimd.c +++ b/sysdeps/aarch64/fpu/tan_advsimd.c @@ -23,7 +23,8 @@ static const struct data { float64x2_t poly[9]; - float64x2_t half_pi, two_over_pi, shift; + double half_pi[2]; + float64x2_t two_over_pi, shift; #if !WANT_SIMD_EXCEPT float64x2_t range_val; #endif @@ -81,8 +82,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x) /* Use q to reduce x to r in [-pi/4, pi/4], by: r = x - q * pi/2, in extended precision. */ float64x2_t r = x; - r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0); - r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1); + float64x2_t half_pi = vld1q_f64 (dat->half_pi); + r = vfmsq_laneq_f64 (r, q, half_pi, 0); + r = vfmsq_laneq_f64 (r, q, half_pi, 1); /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle formula. */ r = vmulq_n_f64 (r, 0.5); diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c index 5a7489390a..705586f0c0 100644 --- a/sysdeps/aarch64/fpu/tanf_advsimd.c +++ b/sysdeps/aarch64/fpu/tanf_advsimd.c @@ -23,7 +23,7 @@ static const struct data { float32x4_t poly[6]; - float32x4_t pi_consts; + float pi_consts[4]; float32x4_t shift; #if !WANT_SIMD_EXCEPT float32x4_t range_val; @@ -95,16 +95,17 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x) #endif /* n = rint(x/(pi/2)). */ - float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3); + float32x4_t pi_consts = vld1q_f32 (d->pi_consts); + float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3); float32x4_t n = vsubq_f32 (q, d->shift); /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1)); /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). 
*/ float32x4_t r; - r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0); - r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1); - r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2); + r = vfmaq_laneq_f32 (x, n, pi_consts, 0); + r = vfmaq_laneq_f32 (r, n, pi_consts, 1); + r = vfmaq_laneq_f32 (r, n, pi_consts, 2); /* If x lives in an interval, where |tan(x)| - is finite, then use a polynomial approximation of the form diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c index f2d8714075..8c98161662 100644 --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c @@ -30,6 +30,7 @@ VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh) VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan) VPCS_VECTOR_WRAPPER (atanh_advsimd, _ZGVnN2v_atanh) VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2) +VPCS_VECTOR_WRAPPER (cbrt_advsimd, _ZGVnN2v_cbrt) VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos) VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh) VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf) @@ -38,10 +39,12 @@ VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp) VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10) VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2) VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1) +VPCS_VECTOR_WRAPPER_ff (hypot_advsimd, _ZGVnN2vv_hypot) VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log) VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10) VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p) VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2) +VPCS_VECTOR_WRAPPER_ff (pow_advsimd, _ZGVnN2vv_pow) VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin) VPCS_VECTOR_WRAPPER (sinh_advsimd, _ZGVnN2v_sinh) VPCS_VECTOR_WRAPPER (tan_advsimd, _ZGVnN2v_tan) diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c index 37873d5e43..2583428af5 100644 --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c +++ 
b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c @@ -49,6 +49,7 @@ SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh) SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan) SVE_VECTOR_WRAPPER (atanh_sve, _ZGVsMxv_atanh) SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2) +SVE_VECTOR_WRAPPER (cbrt_sve, _ZGVsMxv_cbrt) SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos) SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh) SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf) @@ -57,10 +58,12 @@ SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp) SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10) SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2) SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1) +SVE_VECTOR_WRAPPER_ff (hypot_sve, _ZGVsMxvv_hypot) SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log) SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10) SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p) SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2) +SVE_VECTOR_WRAPPER_ff (pow_sve, _ZGVsMxvv_pow) SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin) SVE_VECTOR_WRAPPER (sinh_sve, _ZGVsMxv_sinh) SVE_VECTOR_WRAPPER (tan_sve, _ZGVsMxv_tan) diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c index 08e33115b9..26679018d6 100644 --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c @@ -30,6 +30,7 @@ VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf) VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf) VPCS_VECTOR_WRAPPER (atanhf_advsimd, _ZGVnN4v_atanhf) VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f) +VPCS_VECTOR_WRAPPER (cbrtf_advsimd, _ZGVnN4v_cbrtf) VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf) VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf) VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff) @@ -38,10 +39,12 @@ VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf) VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f) VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f) VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f) 
+VPCS_VECTOR_WRAPPER_ff (hypotf_advsimd, _ZGVnN4vv_hypotf) VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf) VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f) VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf) VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f) +VPCS_VECTOR_WRAPPER_ff (powf_advsimd, _ZGVnN4vv_powf) VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf) VPCS_VECTOR_WRAPPER (sinhf_advsimd, _ZGVnN4v_sinhf) VPCS_VECTOR_WRAPPER (tanf_advsimd, _ZGVnN4v_tanf) diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c index 025daa662e..0f972b7886 100644 --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c @@ -49,6 +49,7 @@ SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf) SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf) SVE_VECTOR_WRAPPER (atanhf_sve, _ZGVsMxv_atanhf) SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f) +SVE_VECTOR_WRAPPER (cbrtf_sve, _ZGVsMxv_cbrtf) SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf) SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf) SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff) @@ -57,10 +58,12 @@ SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf) SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f) SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f) SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f) +SVE_VECTOR_WRAPPER_ff (hypotf_sve, _ZGVsMxvv_hypotf) SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf) SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f) SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf) SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f) +SVE_VECTOR_WRAPPER_ff (powf_sve, _ZGVsMxvv_powf) SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf) SVE_VECTOR_WRAPPER (sinhf_sve, _ZGVsMxv_sinhf) SVE_VECTOR_WRAPPER (tanf_sve, _ZGVsMxv_tanf) diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h index a3b0e32f9e..08b06e0a6b 100644 --- a/sysdeps/aarch64/fpu/v_expf_inline.h +++ b/sysdeps/aarch64/fpu/v_expf_inline.h @@ -25,7 +25,8 @@ struct 
v_expf_data { float32x4_t poly[5]; - float32x4_t shift, invln2_and_ln2; + float32x4_t shift; + float invln2_and_ln2[4]; }; /* maxerr: 1.45358 +0.5 ulp. */ @@ -50,10 +51,11 @@ v_expf_inline (float32x4_t x, const struct v_expf_data *d) /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ float32x4_t n, r, z; - z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0); + float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); + z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0); n = vsubq_f32 (z, d->shift); - r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1); - r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2); + r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1); + r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2); uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h index 337ccfbfab..59b552da6b 100644 --- a/sysdeps/aarch64/fpu/v_expm1f_inline.h +++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h @@ -26,7 +26,8 @@ struct v_expm1f_data { float32x4_t poly[5]; - float32x4_t invln2_and_ln2, shift; + float invln2_and_ln2[4]; + float32x4_t shift; int32x4_t exponent_bias; }; @@ -49,11 +50,12 @@ expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) calling routine should handle special values if required. */ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. 
*/ - float32x4_t j = vsubq_f32 ( - vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift); + float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); + float32x4_t j + = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift); int32x4_t i = vcvtq_s32_f32 (j); - float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1); - f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2); + float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1); + f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2); /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses diff --git a/sysdeps/aarch64/fpu/v_pow_exp_data.c b/sysdeps/aarch64/fpu/v_pow_exp_data.c new file mode 100644 index 0000000000..8b7fb83668 --- /dev/null +++ b/sysdeps/aarch64/fpu/v_pow_exp_data.c @@ -0,0 +1,301 @@ +/* Shared data between exp, exp2 and pow. + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "vecmath_config.h" + +#define N (1 << V_POW_EXP_TABLE_BITS) + +const struct v_pow_exp_data __v_pow_exp_data = { +// exp polynomial coefficients. 
+.poly = { +// abs error: 1.43*2^-58 +// ulp error: 0.549 (0.550 without fma) +// if |x| < ln2/512 +0x1.fffffffffffd4p-2, +0x1.5555571d6ef9p-3, +0x1.5555576a5adcep-5, +}, +// N/ln2 +.n_over_ln2 = 0x1.71547652b82fep0 * N, +// ln2/N +.ln2_over_n_hi = 0x1.62e42fefc0000p-9, +.ln2_over_n_lo = -0x1.c610ca86c3899p-45, +// Used for rounding to nearest integer without using intrinsics. +.shift = 0x1.8p52, +// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) +// sbits[k] = asuint64(H[k]) - (k << 52)/N +.sbits = { +0x3ff0000000000000, +0x3feffb1afa5abcbf, +0x3feff63da9fb3335, +0x3feff168143b0281, +0x3fefec9a3e778061, +0x3fefe7d42e11bbcc, +0x3fefe315e86e7f85, +0x3fefde5f72f654b1, +0x3fefd9b0d3158574, +0x3fefd50a0e3c1f89, +0x3fefd06b29ddf6de, +0x3fefcbd42b72a836, +0x3fefc74518759bc8, +0x3fefc2bdf66607e0, +0x3fefbe3ecac6f383, +0x3fefb9c79b1f3919, +0x3fefb5586cf9890f, +0x3fefb0f145e46c85, +0x3fefac922b7247f7, +0x3fefa83b23395dec, +0x3fefa3ec32d3d1a2, +0x3fef9fa55fdfa9c5, +0x3fef9b66affed31b, +0x3fef973028d7233e, +0x3fef9301d0125b51, +0x3fef8edbab5e2ab6, +0x3fef8abdc06c31cc, +0x3fef86a814f204ab, +0x3fef829aaea92de0, +0x3fef7e95934f312e, +0x3fef7a98c8a58e51, +0x3fef76a45471c3c2, +0x3fef72b83c7d517b, +0x3fef6ed48695bbc0, +0x3fef6af9388c8dea, +0x3fef672658375d2f, +0x3fef635beb6fcb75, +0x3fef5f99f8138a1c, +0x3fef5be084045cd4, +0x3fef582f95281c6b, +0x3fef54873168b9aa, +0x3fef50e75eb44027, +0x3fef4d5022fcd91d, +0x3fef49c18438ce4d, +0x3fef463b88628cd6, +0x3fef42be3578a819, +0x3fef3f49917ddc96, +0x3fef3bdda27912d1, +0x3fef387a6e756238, +0x3fef351ffb82140a, +0x3fef31ce4fb2a63f, +0x3fef2e85711ece75, +0x3fef2b4565e27cdd, +0x3fef280e341ddf29, +0x3fef24dfe1f56381, +0x3fef21ba7591bb70, +0x3fef1e9df51fdee1, +0x3fef1b8a66d10f13, +0x3fef187fd0dad990, +0x3fef157e39771b2f, +0x3fef1285a6e4030b, +0x3fef0f961f641589, +0x3fef0cafa93e2f56, +0x3fef09d24abd886b, +0x3fef06fe0a31b715, +0x3fef0432edeeb2fd, +0x3fef0170fc4cd831, +0x3feefeb83ba8ea32, +0x3feefc08b26416ff, +0x3feef96266e3fa2d, +0x3feef6c55f929ff1, 
+0x3feef431a2de883b, +0x3feef1a7373aa9cb, +0x3feeef26231e754a, +0x3feeecae6d05d866, +0x3feeea401b7140ef, +0x3feee7db34e59ff7, +0x3feee57fbfec6cf4, +0x3feee32dc313a8e5, +0x3feee0e544ede173, +0x3feedea64c123422, +0x3feedc70df1c5175, +0x3feeda4504ac801c, +0x3feed822c367a024, +0x3feed60a21f72e2a, +0x3feed3fb2709468a, +0x3feed1f5d950a897, +0x3feecffa3f84b9d4, +0x3feece086061892d, +0x3feecc2042a7d232, +0x3feeca41ed1d0057, +0x3feec86d668b3237, +0x3feec6a2b5c13cd0, +0x3feec4e1e192aed2, +0x3feec32af0d7d3de, +0x3feec17dea6db7d7, +0x3feebfdad5362a27, +0x3feebe41b817c114, +0x3feebcb299fddd0d, +0x3feebb2d81d8abff, +0x3feeb9b2769d2ca7, +0x3feeb8417f4531ee, +0x3feeb6daa2cf6642, +0x3feeb57de83f4eef, +0x3feeb42b569d4f82, +0x3feeb2e2f4f6ad27, +0x3feeb1a4ca5d920f, +0x3feeb070dde910d2, +0x3feeaf4736b527da, +0x3feeae27dbe2c4cf, +0x3feead12d497c7fd, +0x3feeac0827ff07cc, +0x3feeab07dd485429, +0x3feeaa11fba87a03, +0x3feea9268a5946b7, +0x3feea84590998b93, +0x3feea76f15ad2148, +0x3feea6a320dceb71, +0x3feea5e1b976dc09, +0x3feea52ae6cdf6f4, +0x3feea47eb03a5585, +0x3feea3dd1d1929fd, +0x3feea34634ccc320, +0x3feea2b9febc8fb7, +0x3feea23882552225, +0x3feea1c1c70833f6, +0x3feea155d44ca973, +0x3feea0f4b19e9538, +0x3feea09e667f3bcd, +0x3feea052fa75173e, +0x3feea012750bdabf, +0x3fee9fdcddd47645, +0x3fee9fb23c651a2f, +0x3fee9f9298593ae5, +0x3fee9f7df9519484, +0x3fee9f7466f42e87, +0x3fee9f75e8ec5f74, +0x3fee9f8286ead08a, +0x3fee9f9a48a58174, +0x3fee9fbd35d7cbfd, +0x3fee9feb564267c9, +0x3feea024b1ab6e09, +0x3feea0694fde5d3f, +0x3feea0b938ac1cf6, +0x3feea11473eb0187, +0x3feea17b0976cfdb, +0x3feea1ed0130c132, +0x3feea26a62ff86f0, +0x3feea2f336cf4e62, +0x3feea3878491c491, +0x3feea427543e1a12, +0x3feea4d2add106d9, +0x3feea589994cce13, +0x3feea64c1eb941f7, +0x3feea71a4623c7ad, +0x3feea7f4179f5b21, +0x3feea8d99b4492ed, +0x3feea9cad931a436, +0x3feeaac7d98a6699, +0x3feeabd0a478580f, +0x3feeace5422aa0db, +0x3feeae05bad61778, +0x3feeaf3216b5448c, +0x3feeb06a5e0866d9, +0x3feeb1ae99157736, +0x3feeb2fed0282c8a, 
+0x3feeb45b0b91ffc6, +0x3feeb5c353aa2fe2, +0x3feeb737b0cdc5e5, +0x3feeb8b82b5f98e5, +0x3feeba44cbc8520f, +0x3feebbdd9a7670b3, +0x3feebd829fde4e50, +0x3feebf33e47a22a2, +0x3feec0f170ca07ba, +0x3feec2bb4d53fe0d, +0x3feec49182a3f090, +0x3feec674194bb8d5, +0x3feec86319e32323, +0x3feeca5e8d07f29e, +0x3feecc667b5de565, +0x3feece7aed8eb8bb, +0x3feed09bec4a2d33, +0x3feed2c980460ad8, +0x3feed503b23e255d, +0x3feed74a8af46052, +0x3feed99e1330b358, +0x3feedbfe53c12e59, +0x3feede6b5579fdbf, +0x3feee0e521356eba, +0x3feee36bbfd3f37a, +0x3feee5ff3a3c2774, +0x3feee89f995ad3ad, +0x3feeeb4ce622f2ff, +0x3feeee07298db666, +0x3feef0ce6c9a8952, +0x3feef3a2b84f15fb, +0x3feef68415b749b1, +0x3feef9728de5593a, +0x3feefc6e29f1c52a, +0x3feeff76f2fb5e47, +0x3fef028cf22749e4, +0x3fef05b030a1064a, +0x3fef08e0b79a6f1f, +0x3fef0c1e904bc1d2, +0x3fef0f69c3f3a207, +0x3fef12c25bd71e09, +0x3fef16286141b33d, +0x3fef199bdd85529c, +0x3fef1d1cd9fa652c, +0x3fef20ab5fffd07a, +0x3fef244778fafb22, +0x3fef27f12e57d14b, +0x3fef2ba88988c933, +0x3fef2f6d9406e7b5, +0x3fef33405751c4db, +0x3fef3720dcef9069, +0x3fef3b0f2e6d1675, +0x3fef3f0b555dc3fa, +0x3fef43155b5bab74, +0x3fef472d4a07897c, +0x3fef4b532b08c968, +0x3fef4f87080d89f2, +0x3fef53c8eacaa1d6, +0x3fef5818dcfba487, +0x3fef5c76e862e6d3, +0x3fef60e316c98398, +0x3fef655d71ff6075, +0x3fef69e603db3285, +0x3fef6e7cd63a8315, +0x3fef7321f301b460, +0x3fef77d5641c0658, +0x3fef7c97337b9b5f, +0x3fef81676b197d17, +0x3fef864614f5a129, +0x3fef8b333b16ee12, +0x3fef902ee78b3ff6, +0x3fef953924676d76, +0x3fef9a51fbc74c83, +0x3fef9f7977cdb740, +0x3fefa4afa2a490da, +0x3fefa9f4867cca6e, +0x3fefaf482d8e67f1, +0x3fefb4aaa2188510, +0x3fefba1bee615a27, +0x3fefbf9c1cb6412a, +0x3fefc52b376bba97, +0x3fefcac948dd7274, +0x3fefd0765b6e4540, +0x3fefd632798844f8, +0x3fefdbfdad9cbe14, +0x3fefe1d802243c89, +0x3fefe7c1819e90d8, +0x3fefedba3692d514, +0x3feff3c22b8f71f1, +0x3feff9d96b2a23d9, +}, +}; diff --git a/sysdeps/aarch64/fpu/v_pow_log_data.c b/sysdeps/aarch64/fpu/v_pow_log_data.c new file 
mode 100644 index 0000000000..0242fff477 --- /dev/null +++ b/sysdeps/aarch64/fpu/v_pow_log_data.c @@ -0,0 +1,186 @@ +/* Data for the log part of pow. + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "vecmath_config.h" + +#define N (1 << V_POW_LOG_TABLE_BITS) + +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + log(z/c) + log(z/c) = poly(z/c - 1) + + where z is in [0x1.69555p-1; 0x1.69555p0] which is split into N subintervals + and z falls into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = round(0x1p43*log(c))/0x1p43 + tab[i].logctail = (double)(log(c) - logc) + + where c is chosen near the center of the subinterval such that 1/c has only + a few precision bits so z/c - 1 is exactly representible as double: + + 1/c = center < 1 ? round(N/center)/N : round(2*N/center)/N/2 + + Note: |z/c - 1| < 1/N for the chosen c, |log(c) - logc - logctail| < + 0x1p-97, the last few bits of logc are rounded away so k*ln2hi + logc has no + rounding error and the interval for z is selected such that near x == 1, + where log(x) + is tiny, large cancellation error is avoided in logc + poly(z/c - 1). 
*/ +const struct v_pow_log_data __v_pow_log_data = { + /* relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] + Coefficients are scaled to match the scaling during evaluation. */ + .poly = { -0x1p-1, -0x1.555555555556p-1, 0x1.0000000000006p-1, + 0x1.999999959554ep-1, -0x1.555555529a47ap-1, -0x1.2495b9b4845e9p0, + 0x1.0002b8b263fc3p0, }, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .invc = { 0x1.6a00000000000p+0, 0x1.6800000000000p+0, 0x1.6600000000000p+0, + 0x1.6400000000000p+0, 0x1.6200000000000p+0, 0x1.6000000000000p+0, + 0x1.5e00000000000p+0, 0x1.5c00000000000p+0, 0x1.5a00000000000p+0, + 0x1.5800000000000p+0, 0x1.5600000000000p+0, 0x1.5600000000000p+0, + 0x1.5400000000000p+0, 0x1.5200000000000p+0, 0x1.5000000000000p+0, + 0x1.4e00000000000p+0, 0x1.4c00000000000p+0, 0x1.4a00000000000p+0, + 0x1.4a00000000000p+0, 0x1.4800000000000p+0, 0x1.4600000000000p+0, + 0x1.4400000000000p+0, 0x1.4200000000000p+0, 0x1.4000000000000p+0, + 0x1.4000000000000p+0, 0x1.3e00000000000p+0, 0x1.3c00000000000p+0, + 0x1.3a00000000000p+0, 0x1.3a00000000000p+0, 0x1.3800000000000p+0, + 0x1.3600000000000p+0, 0x1.3400000000000p+0, 0x1.3400000000000p+0, + 0x1.3200000000000p+0, 0x1.3000000000000p+0, 0x1.3000000000000p+0, + 0x1.2e00000000000p+0, 0x1.2c00000000000p+0, 0x1.2c00000000000p+0, + 0x1.2a00000000000p+0, 0x1.2800000000000p+0, 0x1.2600000000000p+0, + 0x1.2600000000000p+0, 0x1.2400000000000p+0, 0x1.2400000000000p+0, + 0x1.2200000000000p+0, 0x1.2000000000000p+0, 0x1.2000000000000p+0, + 0x1.1e00000000000p+0, 0x1.1c00000000000p+0, 0x1.1c00000000000p+0, + 0x1.1a00000000000p+0, 0x1.1a00000000000p+0, 0x1.1800000000000p+0, + 0x1.1600000000000p+0, 0x1.1600000000000p+0, 0x1.1400000000000p+0, + 0x1.1400000000000p+0, 0x1.1200000000000p+0, 0x1.1000000000000p+0, + 0x1.1000000000000p+0, 0x1.0e00000000000p+0, 0x1.0e00000000000p+0, + 0x1.0c00000000000p+0, 0x1.0c00000000000p+0, 0x1.0a00000000000p+0, + 0x1.0a00000000000p+0, 0x1.0800000000000p+0, 0x1.0800000000000p+0, + 
0x1.0600000000000p+0, 0x1.0400000000000p+0, 0x1.0400000000000p+0, + 0x1.0200000000000p+0, 0x1.0200000000000p+0, 0x1.0000000000000p+0, + 0x1.0000000000000p+0, 0x1.fc00000000000p-1, 0x1.f800000000000p-1, + 0x1.f400000000000p-1, 0x1.f000000000000p-1, 0x1.ec00000000000p-1, + 0x1.e800000000000p-1, 0x1.e400000000000p-1, 0x1.e200000000000p-1, + 0x1.de00000000000p-1, 0x1.da00000000000p-1, 0x1.d600000000000p-1, + 0x1.d400000000000p-1, 0x1.d000000000000p-1, 0x1.cc00000000000p-1, + 0x1.ca00000000000p-1, 0x1.c600000000000p-1, 0x1.c400000000000p-1, + 0x1.c000000000000p-1, 0x1.be00000000000p-1, 0x1.ba00000000000p-1, + 0x1.b800000000000p-1, 0x1.b400000000000p-1, 0x1.b200000000000p-1, + 0x1.ae00000000000p-1, 0x1.ac00000000000p-1, 0x1.aa00000000000p-1, + 0x1.a600000000000p-1, 0x1.a400000000000p-1, 0x1.a000000000000p-1, + 0x1.9e00000000000p-1, 0x1.9c00000000000p-1, 0x1.9a00000000000p-1, + 0x1.9600000000000p-1, 0x1.9400000000000p-1, 0x1.9200000000000p-1, + 0x1.9000000000000p-1, 0x1.8c00000000000p-1, 0x1.8a00000000000p-1, + 0x1.8800000000000p-1, 0x1.8600000000000p-1, 0x1.8400000000000p-1, + 0x1.8200000000000p-1, 0x1.7e00000000000p-1, 0x1.7c00000000000p-1, + 0x1.7a00000000000p-1, 0x1.7800000000000p-1, 0x1.7600000000000p-1, + 0x1.7400000000000p-1, 0x1.7200000000000p-1, 0x1.7000000000000p-1, + 0x1.6e00000000000p-1, 0x1.6c00000000000p-1, }, + .logc + = { -0x1.62c82f2b9c800p-2, -0x1.5d1bdbf580800p-2, -0x1.5767717455800p-2, + -0x1.51aad872df800p-2, -0x1.4be5f95777800p-2, -0x1.4618bc21c6000p-2, + -0x1.404308686a800p-2, -0x1.3a64c55694800p-2, -0x1.347dd9a988000p-2, + -0x1.2e8e2bae12000p-2, -0x1.2895a13de8800p-2, -0x1.2895a13de8800p-2, + -0x1.22941fbcf7800p-2, -0x1.1c898c1699800p-2, -0x1.1675cababa800p-2, + -0x1.1058bf9ae4800p-2, -0x1.0a324e2739000p-2, -0x1.0402594b4d000p-2, + -0x1.0402594b4d000p-2, -0x1.fb9186d5e4000p-3, -0x1.ef0adcbdc6000p-3, + -0x1.e27076e2af000p-3, -0x1.d5c216b4fc000p-3, -0x1.c8ff7c79aa000p-3, + -0x1.c8ff7c79aa000p-3, -0x1.bc286742d9000p-3, -0x1.af3c94e80c000p-3, + 
-0x1.a23bc1fe2b000p-3, -0x1.a23bc1fe2b000p-3, -0x1.9525a9cf45000p-3, + -0x1.87fa06520d000p-3, -0x1.7ab890210e000p-3, -0x1.7ab890210e000p-3, + -0x1.6d60fe719d000p-3, -0x1.5ff3070a79000p-3, -0x1.5ff3070a79000p-3, + -0x1.526e5e3a1b000p-3, -0x1.44d2b6ccb8000p-3, -0x1.44d2b6ccb8000p-3, + -0x1.371fc201e9000p-3, -0x1.29552f81ff000p-3, -0x1.1b72ad52f6000p-3, + -0x1.1b72ad52f6000p-3, -0x1.0d77e7cd09000p-3, -0x1.0d77e7cd09000p-3, + -0x1.fec9131dbe000p-4, -0x1.e27076e2b0000p-4, -0x1.e27076e2b0000p-4, + -0x1.c5e548f5bc000p-4, -0x1.a926d3a4ae000p-4, -0x1.a926d3a4ae000p-4, + -0x1.8c345d631a000p-4, -0x1.8c345d631a000p-4, -0x1.6f0d28ae56000p-4, + -0x1.51b073f062000p-4, -0x1.51b073f062000p-4, -0x1.341d7961be000p-4, + -0x1.341d7961be000p-4, -0x1.16536eea38000p-4, -0x1.f0a30c0118000p-5, + -0x1.f0a30c0118000p-5, -0x1.b42dd71198000p-5, -0x1.b42dd71198000p-5, + -0x1.77458f632c000p-5, -0x1.77458f632c000p-5, -0x1.39e87b9fec000p-5, + -0x1.39e87b9fec000p-5, -0x1.f829b0e780000p-6, -0x1.f829b0e780000p-6, + -0x1.7b91b07d58000p-6, -0x1.fc0a8b0fc0000p-7, -0x1.fc0a8b0fc0000p-7, + -0x1.fe02a6b100000p-8, -0x1.fe02a6b100000p-8, 0x0.0000000000000p+0, + 0x0.0000000000000p+0, 0x1.0101575890000p-7, 0x1.0205658938000p-6, + 0x1.8492528c90000p-6, 0x1.0415d89e74000p-5, 0x1.466aed42e0000p-5, + 0x1.894aa149fc000p-5, 0x1.ccb73cdddc000p-5, 0x1.eea31c006c000p-5, + 0x1.1973bd1466000p-4, 0x1.3bdf5a7d1e000p-4, 0x1.5e95a4d97a000p-4, + 0x1.700d30aeac000p-4, 0x1.9335e5d594000p-4, 0x1.b6ac88dad6000p-4, + 0x1.c885801bc4000p-4, 0x1.ec739830a2000p-4, 0x1.fe89139dbe000p-4, + 0x1.1178e8227e000p-3, 0x1.1aa2b7e23f000p-3, 0x1.2d1610c868000p-3, + 0x1.365fcb0159000p-3, 0x1.4913d8333b000p-3, 0x1.527e5e4a1b000p-3, + 0x1.6574ebe8c1000p-3, 0x1.6f0128b757000p-3, 0x1.7898d85445000p-3, + 0x1.8beafeb390000p-3, 0x1.95a5adcf70000p-3, 0x1.a93ed3c8ae000p-3, + 0x1.b31d8575bd000p-3, 0x1.bd087383be000p-3, 0x1.c6ffbc6f01000p-3, + 0x1.db13db0d49000p-3, 0x1.e530effe71000p-3, 0x1.ef5ade4dd0000p-3, + 0x1.f991c6cb3b000p-3, 0x1.07138604d5800p-2, 
0x1.0c42d67616000p-2, + 0x1.1178e8227e800p-2, 0x1.16b5ccbacf800p-2, 0x1.1bf99635a6800p-2, + 0x1.214456d0eb800p-2, 0x1.2bef07cdc9000p-2, 0x1.314f1e1d36000p-2, + 0x1.36b6776be1000p-2, 0x1.3c25277333000p-2, 0x1.419b423d5e800p-2, + 0x1.4718dc271c800p-2, 0x1.4c9e09e173000p-2, 0x1.522ae0738a000p-2, + 0x1.57bf753c8d000p-2, 0x1.5d5bddf596000p-2, }, + .logctail + = { 0x1.ab42428375680p-48, -0x1.ca508d8e0f720p-46, -0x1.362a4d5b6506dp-45, + -0x1.684e49eb067d5p-49, -0x1.41b6993293ee0p-47, 0x1.3d82f484c84ccp-46, + 0x1.c42f3ed820b3ap-50, 0x1.0b1c686519460p-45, 0x1.5594dd4c58092p-45, + 0x1.67b1e99b72bd8p-45, 0x1.5ca14b6cfb03fp-46, 0x1.5ca14b6cfb03fp-46, + -0x1.65a242853da76p-46, -0x1.fafbc68e75404p-46, 0x1.f1fc63382a8f0p-46, + -0x1.6a8c4fd055a66p-45, -0x1.c6bee7ef4030ep-47, -0x1.036b89ef42d7fp-48, + -0x1.036b89ef42d7fp-48, 0x1.d572aab993c87p-47, 0x1.b26b79c86af24p-45, + -0x1.72f4f543fff10p-46, 0x1.1ba91bbca681bp-45, 0x1.7794f689f8434p-45, + 0x1.7794f689f8434p-45, 0x1.94eb0318bb78fp-46, 0x1.a4e633fcd9066p-52, + -0x1.58c64dc46c1eap-45, -0x1.58c64dc46c1eap-45, -0x1.ad1d904c1d4e3p-45, + 0x1.bbdbf7fdbfa09p-45, 0x1.bdb9072534a58p-45, 0x1.bdb9072534a58p-45, + -0x1.0e46aa3b2e266p-46, -0x1.e9e439f105039p-46, -0x1.e9e439f105039p-46, + -0x1.0de8b90075b8fp-45, 0x1.70cc16135783cp-46, 0x1.70cc16135783cp-46, + 0x1.178864d27543ap-48, -0x1.48d301771c408p-45, -0x1.e80a41811a396p-45, + -0x1.e80a41811a396p-45, 0x1.a699688e85bf4p-47, 0x1.a699688e85bf4p-47, + -0x1.575545ca333f2p-45, 0x1.a342c2af0003cp-45, 0x1.a342c2af0003cp-45, + -0x1.d0c57585fbe06p-46, 0x1.53935e85baac8p-45, 0x1.53935e85baac8p-45, + 0x1.37c294d2f5668p-46, 0x1.37c294d2f5668p-46, -0x1.69737c93373dap-45, + 0x1.f025b61c65e57p-46, 0x1.f025b61c65e57p-46, 0x1.c5edaccf913dfp-45, + 0x1.c5edaccf913dfp-45, 0x1.47c5e768fa309p-46, 0x1.d599e83368e91p-45, + 0x1.d599e83368e91p-45, 0x1.c827ae5d6704cp-46, 0x1.c827ae5d6704cp-46, + -0x1.cfc4634f2a1eep-45, -0x1.cfc4634f2a1eep-45, 0x1.502b7f526feaap-48, + 0x1.502b7f526feaap-48, -0x1.980267c7e09e4p-45, 
-0x1.980267c7e09e4p-45, + -0x1.88d5493faa639p-45, -0x1.f1e7cf6d3a69cp-50, -0x1.f1e7cf6d3a69cp-50, + -0x1.9e23f0dda40e4p-46, -0x1.9e23f0dda40e4p-46, 0x0.0000000000000p+0, + 0x0.0000000000000p+0, -0x1.0c76b999d2be8p-46, -0x1.3dc5b06e2f7d2p-45, + -0x1.aa0ba325a0c34p-45, 0x1.111c05cf1d753p-47, -0x1.c167375bdfd28p-45, + -0x1.97995d05a267dp-46, -0x1.a68f247d82807p-46, -0x1.e113e4fc93b7bp-47, + -0x1.5325d560d9e9bp-45, 0x1.cc85ea5db4ed7p-45, -0x1.c69063c5d1d1ep-45, + 0x1.c1e8da99ded32p-49, 0x1.3115c3abd47dap-45, -0x1.390802bf768e5p-46, + 0x1.646d1c65aacd3p-45, -0x1.dc068afe645e0p-45, -0x1.534d64fa10afdp-45, + 0x1.1ef78ce2d07f2p-45, 0x1.ca78e44389934p-45, 0x1.39d6ccb81b4a1p-47, + 0x1.62fa8234b7289p-51, 0x1.5837954fdb678p-45, 0x1.633e8e5697dc7p-45, + 0x1.9cf8b2c3c2e78p-46, -0x1.5118de59c21e1p-45, -0x1.c661070914305p-46, + -0x1.73d54aae92cd1p-47, 0x1.7f22858a0ff6fp-47, -0x1.8724350562169p-45, + -0x1.c358d4eace1aap-47, -0x1.d4bc4595412b6p-45, -0x1.1ec72c5962bd2p-48, + -0x1.aff2af715b035p-45, 0x1.212276041f430p-51, -0x1.a211565bb8e11p-51, + 0x1.bcbecca0cdf30p-46, 0x1.89cdb16ed4e91p-48, 0x1.7188b163ceae9p-45, + -0x1.c210e63a5f01cp-45, 0x1.b9acdf7a51681p-45, 0x1.ca6ed5147bdb7p-45, + 0x1.a87deba46baeap-47, 0x1.a9cfa4a5004f4p-45, -0x1.8e27ad3213cb8p-45, + 0x1.16ecdb0f177c8p-46, 0x1.83b54b606bd5cp-46, 0x1.8e436ec90e09dp-47, + -0x1.f27ce0967d675p-45, -0x1.e20891b0ad8a4p-45, 0x1.ebe708164c759p-45, + 0x1.fadedee5d40efp-46, -0x1.a0b2a08a465dcp-47, }, +}; diff --git a/sysdeps/aarch64/fpu/v_powf_data.c b/sysdeps/aarch64/fpu/v_powf_data.c new file mode 100644 index 0000000000..f789b84850 --- /dev/null +++ b/sysdeps/aarch64/fpu/v_powf_data.c @@ -0,0 +1,102 @@ +/* Coefficients for single-precision SVE pow(x) function. + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include "vecmath_config.h" + +const struct v_powf_data __v_powf_data = { + .invc = { 0x1.6489890582816p+0, + 0x1.5cf19b35e3472p+0, + 0x1.55aac0e956d65p+0, + 0x1.4eb0022977e01p+0, + 0x1.47fcccda1dd1fp+0, + 0x1.418ceabab68c1p+0, + 0x1.3b5c788f1edb3p+0, + 0x1.3567de48e9c9ap+0, + 0x1.2fabc80fd19bap+0, + 0x1.2a25200ce536bp+0, + 0x1.24d108e0152e3p+0, + 0x1.1facd8ab2fbe1p+0, + 0x1.1ab614a03efdfp+0, + 0x1.15ea6d03af9ffp+0, + 0x1.1147b994bb776p+0, + 0x1.0ccbf650593aap+0, + 0x1.0875408477302p+0, + 0x1.0441d42a93328p+0, + 0x1p+0, + 0x1.f1d006c855e86p-1, + 0x1.e28c3341aa301p-1, + 0x1.d4bdf9aa64747p-1, + 0x1.c7b45a24e5803p-1, + 0x1.bb5f5eb2ed60ap-1, + 0x1.afb0bff8fe6b4p-1, + 0x1.a49badf7ab1f5p-1, + 0x1.9a14a111fc4c9p-1, + 0x1.901131f5b2fdcp-1, + 0x1.8687f73f6d865p-1, + 0x1.7d7067eb77986p-1, + 0x1.74c2c1cf97b65p-1, + 0x1.6c77f37cff2a1p-1 + }, + .logc = { -0x1.e960f97b22702p+3, + -0x1.c993406cd4db6p+3, + -0x1.aa711d9a7d0f3p+3, + -0x1.8bf37bacdce9bp+3, + -0x1.6e13b3519946ep+3, + -0x1.50cb8281e4089p+3, + -0x1.341504a237e2bp+3, + -0x1.17eaab624ffbbp+3, + -0x1.f88e708f8c853p+2, + -0x1.c24b6da113914p+2, + -0x1.8d02ee397cb1dp+2, + -0x1.58ac1223408b3p+2, + -0x1.253e6fd190e89p+2, + -0x1.e5641882c12ffp+1, + -0x1.81fea712926f7p+1, + -0x1.203e240de64a3p+1, + -0x1.8029b86a78281p0, + -0x1.85d713190fb9p-1, + 0x0p+0, + 
0x1.4c1cc07312997p0, + 0x1.5e1848ccec948p+1, + 0x1.04cfcb7f1196fp+2, + 0x1.582813d463c21p+2, + 0x1.a936fa68760ccp+2, + 0x1.f81bc31d6cc4ep+2, + 0x1.2279a09fae6b1p+3, + 0x1.47ec0b6df5526p+3, + 0x1.6c71762280f1p+3, + 0x1.90155070798dap+3, + 0x1.b2e23b1d3068cp+3, + 0x1.d4e21b0daa86ap+3, + 0x1.f61e2a2f67f3fp+3 + }, + .scale = { 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, +}; diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h index c8cfc03bc0..7f0a8aa5f2 100644 --- a/sysdeps/aarch64/fpu/vecmath_config.h +++ b/sysdeps/aarch64/fpu/vecmath_config.h @@ -35,17 +35,6 @@ __ptr; \ }) -static inline uint64_t -asuint64 (double f) -{ - union - { - double f; - uint64_t i; - } u = { f }; - return u.i; -} - #define V_LOG_POLY_ORDER 6 #define V_LOG_TABLE_BITS 7 extern const struct v_log_data @@ -130,4 +119,35 @@ extern const struct erfcf_data } tab[645]; } __erfcf_data attribute_hidden; +/* Some data for AdvSIMD and SVE pow's internal exp and log. */ +#define V_POW_EXP_TABLE_BITS 8 +extern const struct v_pow_exp_data +{ + double poly[3]; + double n_over_ln2, ln2_over_n_hi, ln2_over_n_lo, shift; + uint64_t sbits[1 << V_POW_EXP_TABLE_BITS]; +} __v_pow_exp_data attribute_hidden; + +#define V_POW_LOG_TABLE_BITS 7 +extern const struct v_pow_log_data +{ + double poly[7]; /* First coefficient is 1. 
*/ + double ln2_hi, ln2_lo; + double invc[1 << V_POW_LOG_TABLE_BITS]; + double logc[1 << V_POW_LOG_TABLE_BITS]; + double logctail[1 << V_POW_LOG_TABLE_BITS]; +} __v_pow_log_data attribute_hidden; + +/* Some data for SVE powf's internal exp and log. */ +#define V_POWF_EXP2_TABLE_BITS 5 +#define V_POWF_EXP2_N (1 << V_POWF_EXP2_TABLE_BITS) +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_POWF_LOG2_N (1 << V_POWF_LOG2_TABLE_BITS) +extern const struct v_powf_data +{ + double invc[V_POWF_LOG2_N]; + double logc[V_POWF_LOG2_N]; + uint64_t scale[V_POWF_EXP2_N]; +} __v_powf_data attribute_hidden; + #endif diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps index 116a5404f5..656d4b0169 100644 --- a/sysdeps/aarch64/libm-test-ulps +++ b/sysdeps/aarch64/libm-test-ulps @@ -477,11 +477,19 @@ double: 4 float: 1 ldouble: 1 +Function: "cbrt_advsimd": +double: 1 +float: 1 + Function: "cbrt_downward": double: 4 float: 1 ldouble: 1 +Function: "cbrt_sve": +double: 1 +float: 1 + Function: "cbrt_towardzero": double: 3 float: 1 @@ -1174,10 +1182,18 @@ double: 1 float: 1 ldouble: 1 +Function: "hypot_advsimd": +double: 1 +float: 1 + Function: "hypot_downward": double: 1 ldouble: 1 +Function: "hypot_sve": +double: 1 +float: 1 + Function: "hypot_towardzero": double: 1 ldouble: 1 @@ -1355,6 +1371,26 @@ double: 3 float: 3 ldouble: 1 +Function: "log2p1": +double: 1 +float: 1 +ldouble: 3 + +Function: "log2p1_downward": +double: 2 +float: 2 +ldouble: 3 + +Function: "log2p1_towardzero": +double: 2 +float: 2 +ldouble: 2 + +Function: "log2p1_upward": +double: 1 +float: 2 +ldouble: 2 + Function: "log_advsimd": double: 1 float: 3 @@ -1381,11 +1417,19 @@ double: 1 float: 1 ldouble: 2 +Function: "pow_advsimd": +double: 1 +float: 2 + Function: "pow_downward": double: 1 float: 1 ldouble: 2 +Function: "pow_sve": +double: 1 +float: 2 + Function: "pow_towardzero": double: 1 float: 1 diff --git a/sysdeps/aarch64/multiarch/memchr_generic.S b/sysdeps/aarch64/multiarch/memchr_generic.S index 
0ed5811745..8d554275ba 100644 --- a/sysdeps/aarch64/multiarch/memchr_generic.S +++ b/sysdeps/aarch64/multiarch/memchr_generic.S @@ -24,6 +24,9 @@ # undef libc_hidden_builtin_def # define libc_hidden_builtin_def(name) +# undef weak_alias +# define weak_alias(a, b) + /* Add a hidden definition for use within libc.so. */ # ifdef SHARED .globl __GI_memchr; __GI_memchr = __memchr_generic diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S index 81748bdbce..e125a5ed85 100644 --- a/sysdeps/aarch64/multiarch/memset_generic.S +++ b/sysdeps/aarch64/multiarch/memset_generic.S @@ -33,3 +33,7 @@ #endif #include <../memset.S> + +#if IS_IN (rtld) +strong_alias (memset, __memset_generic) +#endif diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_generic.S index f980a9a68f..ceeafe920c 100644 --- a/sysdeps/aarch64/multiarch/strlen_generic.S +++ b/sysdeps/aarch64/multiarch/strlen_generic.S @@ -30,6 +30,9 @@ # undef libc_hidden_builtin_def # define libc_hidden_builtin_def(name) +# undef weak_alias +# define weak_alias(a, b) + # ifdef SHARED /* It doesn't make sense to send libc-internal strlen calls through a PLT. */ .globl __GI_strlen; __GI_strlen = __strlen_generic diff --git a/sysdeps/arm/libm-test-ulps b/sysdeps/arm/libm-test-ulps index 5720e73288..fc7ba1439a 100644 --- a/sysdeps/arm/libm-test-ulps +++ b/sysdeps/arm/libm-test-ulps @@ -965,6 +965,22 @@ Function: "log2_upward": double: 3 float: 3 +Function: "log2p1": +double: 1 +float: 1 + +Function: "log2p1_downward": +double: 2 +float: 2 + +Function: "log2p1_towardzero": +double: 2 +float: 2 + +Function: "log2p1_upward": +double: 1 +float: 2 + Function: "log_downward": float: 2 diff --git a/sysdeps/i386/fpu/w_exp10_compat.c b/sysdeps/i386/fpu/w_exp10_compat.c index b53455386e..49a0e03385 100644 --- a/sysdeps/i386/fpu/w_exp10_compat.c +++ b/sysdeps/i386/fpu/w_exp10_compat.c @@ -1,3 +1,8 @@ /* i386 provides an optimized __ieee754_exp10. 
*/ -#define NO_COMPAT_NEEDED 1 -#include <math/w_exp10_compat.c> +#ifdef SHARED +# define NO_COMPAT_NEEDED 1 +# include <math/w_exp10_compat.c> +#else +# include <math-type-macros-double.h> +# include <w_exp10_template.c> +#endif diff --git a/sysdeps/i386/fpu/w_fmod_compat.c b/sysdeps/i386/fpu/w_fmod_compat.c index 5ac9995ffd..528bfc2a13 100644 --- a/sysdeps/i386/fpu/w_fmod_compat.c +++ b/sysdeps/i386/fpu/w_fmod_compat.c @@ -7,8 +7,9 @@ # define LIBM_SVID_COMPAT 1 # undef compat_symbol # define compat_symbol(a, b, c, d) -#endif -#include <math/w_fmod_compat.c> -#ifdef SHARED +# include <math/w_fmod_compat.c> libm_alias_double (__fmod_compat, fmod) +#else +#include <math-type-macros-double.h> +#include <w_fmod_template.c> #endif diff --git a/sysdeps/i386/fpu/w_fmodf_compat.c b/sysdeps/i386/fpu/w_fmodf_compat.c index cc417e07d3..5a61693e51 100644 --- a/sysdeps/i386/fpu/w_fmodf_compat.c +++ b/sysdeps/i386/fpu/w_fmodf_compat.c @@ -7,8 +7,9 @@ # define LIBM_SVID_COMPAT 1 # undef compat_symbol # define compat_symbol(a, b, c, d) -#endif -#include <math/w_fmodf_compat.c> -#ifdef SHARED +# include <math/w_fmodf_compat.c> libm_alias_float (__fmod_compat, fmod) +#else +#include <math-type-macros-float.h> +#include <w_fmod_template.c> #endif diff --git a/sysdeps/i386/i586/memcpy.S b/sysdeps/i386/i586/memcpy.S index 3e26f112d6..79856d498a 100644 --- a/sysdeps/i386/i586/memcpy.S +++ b/sysdeps/i386/i586/memcpy.S @@ -26,7 +26,7 @@ #define LEN SRC+4 .text -#if defined PIC && IS_IN (libc) +#if defined SHARED && IS_IN (libc) ENTRY (__memcpy_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps index da6602f909..9eb3088d7a 100644 --- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps +++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps @@ -1424,6 +1424,30 @@ float: 1 float128: 1 ldouble: 1 +Function: "log2p1": +double: 1 +float: 1 +float128: 3 +ldouble: 2 + +Function: "log2p1_downward": 
+double: 1 +float: 1 +float128: 3 +ldouble: 4 + +Function: "log2p1_towardzero": +double: 1 +float: 1 +float128: 2 +ldouble: 4 + +Function: "log2p1_upward": +double: 1 +float: 1 +float128: 2 +ldouble: 5 + Function: "log_downward": double: 1 float128: 1 diff --git a/sysdeps/i386/i686/memmove.S b/sysdeps/i386/i686/memmove.S index f230359ad6..effd958120 100644 --- a/sysdeps/i386/i686/memmove.S +++ b/sysdeps/i386/i686/memmove.S @@ -29,7 +29,7 @@ #define SRC DEST+4 #define LEN SRC+4 -#if defined PIC && IS_IN (libc) +#if defined SHARED && IS_IN (libc) ENTRY_CHK (__memmove_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) diff --git a/sysdeps/i386/i686/memset.S b/sysdeps/i386/i686/memset.S index f02f5a6df7..ab06771ea0 100644 --- a/sysdeps/i386/i686/memset.S +++ b/sysdeps/i386/i686/memset.S @@ -27,7 +27,7 @@ #define LEN CHR+4 .text -#if defined PIC && IS_IN (libc) +#if defined SHARED && IS_IN (libc) ENTRY_CHK (__memset_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) diff --git a/sysdeps/i386/stpncpy.S b/sysdeps/i386/stpncpy.S index 895fee8a1e..74130bc71d 100644 --- a/sysdeps/i386/stpncpy.S +++ b/sysdeps/i386/stpncpy.S @@ -139,4 +139,6 @@ L(9): popl %esi /* restore saved register content */ END (__stpncpy) libc_hidden_def (__stpncpy) +#ifndef __stpncpy weak_alias (__stpncpy, stpncpy) +#endif diff --git a/sysdeps/ieee754/dbl-64/e_exp10.c b/sysdeps/ieee754/dbl-64/e_exp10.c index 225fc74c4c..7ea8270063 100644 --- a/sysdeps/ieee754/dbl-64/e_exp10.c +++ b/sysdeps/ieee754/dbl-64/e_exp10.c @@ -38,7 +38,7 @@ special_case (uint64_t sbits, double_t tmp, uint64_t ki) { double_t scale, y; - if (ki - (1ull << 16) < 0x80000000) + if ((ki & 0x80000000) == 0) { /* The exponent of scale might have overflowed by 1. */ sbits -= 1ull << 52; @@ -100,14 +100,14 @@ __exp10 (double x) /* Reduce x: z = x * N / log10(2), k = round(z). 
*/ double_t z = __exp_data.invlog10_2N * x; double_t kd; - int64_t ki; + uint64_t ki; #if TOINT_INTRINSICS kd = roundtoint (z); ki = converttoint (z); #else kd = math_narrow_eval (z + Shift); + ki = asuint64 (kd); kd -= Shift; - ki = kd; #endif /* r = x - k * log10(2), r in [-0.5, 0.5]. */ diff --git a/sysdeps/ieee754/dbl-64/s_nearbyint.c b/sysdeps/ieee754/dbl-64/s_nearbyint.c index 08fe517b83..606002d680 100644 --- a/sysdeps/ieee754/dbl-64/s_nearbyint.c +++ b/sysdeps/ieee754/dbl-64/s_nearbyint.c @@ -72,4 +72,6 @@ __nearbyint (double x) return t; #endif /* ! USE_NEARBYINT_BUILTIN */ } +#ifndef __nearbyint libm_alias_double (__nearbyint, nearbyint) +#endif diff --git a/sysdeps/ieee754/float128/s_isnanf128.c b/sysdeps/ieee754/float128/s_isnanf128.c index 59f71533ce..291d4427f5 100644 --- a/sysdeps/ieee754/float128/s_isnanf128.c +++ b/sysdeps/ieee754/float128/s_isnanf128.c @@ -11,8 +11,14 @@ #include "../ldbl-128/s_isnanl.c" #if !IS_IN (libm) #include <float128-abi.h> +#ifdef SHARED hidden_ver (__isnanf128_impl, __isnanf128) +#else +strong_alias (__isnanf128_impl, __isnanf128) +#endif +#ifndef isnanl _weak_alias (__isnanf128_impl, isnanl) +#endif versioned_symbol (libc, __isnanf128_impl, __isnanf128, GLIBC_2_34); #if (SHLIB_COMPAT (libc, FLOAT128_VERSION_M, GLIBC_2_34)) strong_alias (__isnanf128_impl, __isnanf128_alias) diff --git a/sysdeps/ieee754/flt-32/s_nearbyintf.c b/sysdeps/ieee754/flt-32/s_nearbyintf.c index b270749f4c..5dd0e8f8af 100644 --- a/sysdeps/ieee754/flt-32/s_nearbyintf.c +++ b/sysdeps/ieee754/flt-32/s_nearbyintf.c @@ -68,4 +68,6 @@ __nearbyintf (float x) return t; #endif /* ! 
USE_NEARBYINT_BUILTIN */ } +#ifndef __nearbyintf libm_alias_float (__nearbyint, nearbyint) +#endif diff --git a/sysdeps/ieee754/ldbl-128/s_finitel.c b/sysdeps/ieee754/ldbl-128/s_finitel.c index bfdd88fbbb..f1fff0ced2 100644 --- a/sysdeps/ieee754/ldbl-128/s_finitel.c +++ b/sysdeps/ieee754/ldbl-128/s_finitel.c @@ -32,4 +32,6 @@ int __finitel(_Float128 x) -0x7fff000000000000LL)>>63); } mathx_hidden_def (__finitel) +#ifndef finitel weak_alias (__finitel, finitel) +#endif diff --git a/sysdeps/ieee754/ldbl-128/s_isinfl.c b/sysdeps/ieee754/ldbl-128/s_isinfl.c index baa9a98e80..a0fa0ba659 100644 --- a/sysdeps/ieee754/ldbl-128/s_isinfl.c +++ b/sysdeps/ieee754/ldbl-128/s_isinfl.c @@ -24,4 +24,6 @@ __isinfl (_Float128 x) return ~(lx >> 63) & (hx >> 62); } mathx_hidden_def (__isinfl) +#ifndef isinfl weak_alias (__isinfl, isinfl) +#endif diff --git a/sysdeps/ieee754/ldbl-128/s_nextafterl.c b/sysdeps/ieee754/ldbl-128/s_nextafterl.c index 90e71e3907..f3f3b138dd 100644 --- a/sysdeps/ieee754/ldbl-128/s_nextafterl.c +++ b/sysdeps/ieee754/ldbl-128/s_nextafterl.c @@ -83,5 +83,9 @@ _Float128 __nextafterl(_Float128 x, _Float128 y) return x; } libm_alias_ldouble (__nextafter, nextafter) +#ifndef __nexttowardl strong_alias (__nextafterl, __nexttowardl) +#endif +#ifndef nexttowardl weak_alias (__nextafterl, nexttowardl) +#endif diff --git a/sysdeps/ieee754/ldbl-128ibm-compat/Versions b/sysdeps/ieee754/ldbl-128ibm-compat/Versions index c0679a1153..c9a74366fd 100644 --- a/sysdeps/ieee754/ldbl-128ibm-compat/Versions +++ b/sysdeps/ieee754/ldbl-128ibm-compat/Versions @@ -137,6 +137,9 @@ libm { __fminimum_magieee128; __fminimum_mag_numieee128; } + GLIBC_2.40 { + __log2p1ieee128; + } } libc { LDBL_IBM128_VERSION { diff --git a/sysdeps/ieee754/ldbl-64-128/s_copysignl.c b/sysdeps/ieee754/ldbl-64-128/s_copysignl.c index 11b42d04ba..80137847d3 100644 --- a/sysdeps/ieee754/ldbl-64-128/s_copysignl.c +++ b/sysdeps/ieee754/ldbl-64-128/s_copysignl.c @@ -1,10 +1,10 @@ #include <math_ldbl_opt.h> #include 
<libm-alias-ldouble.h> -#if IS_IN (libc) +#if IS_IN (libc) && defined SHARED # undef libm_alias_ldouble # define libm_alias_ldouble(from, to) #endif #include <sysdeps/ieee754/ldbl-128/s_copysignl.c> -#if IS_IN (libc) +#if IS_IN (libc) && defined SHARED long_double_symbol (libc, __copysignl, copysignl); #endif diff --git a/sysdeps/ieee754/ldbl-64-128/s_frexpl.c b/sysdeps/ieee754/ldbl-64-128/s_frexpl.c index 73ac41e40c..f5f7d349f7 100644 --- a/sysdeps/ieee754/ldbl-64-128/s_frexpl.c +++ b/sysdeps/ieee754/ldbl-64-128/s_frexpl.c @@ -1,10 +1,10 @@ #include <math_ldbl_opt.h> #include <libm-alias-ldouble.h> -#if IS_IN (libc) +#if IS_IN (libc) && defined SHARED # undef libm_alias_ldouble # define libm_alias_ldouble(from, to) #endif #include <sysdeps/ieee754/ldbl-128/s_frexpl.c> -#if IS_IN (libc) +#if IS_IN (libc) && defined SHARED long_double_symbol (libc, __frexpl, frexpl); #endif diff --git a/sysdeps/ieee754/ldbl-64-128/s_modfl.c b/sysdeps/ieee754/ldbl-64-128/s_modfl.c index 7d7aeae111..ba3d31334a 100644 --- a/sysdeps/ieee754/ldbl-64-128/s_modfl.c +++ b/sysdeps/ieee754/ldbl-64-128/s_modfl.c @@ -1,10 +1,10 @@ #include <math_ldbl_opt.h> #include <libm-alias-ldouble.h> -#if IS_IN (libc) +#if IS_IN (libc) && defined SHARED # undef libm_alias_ldouble # define libm_alias_ldouble(from, to) #endif #include <sysdeps/ieee754/ldbl-128/s_modfl.c> -#if IS_IN (libc) +#if IS_IN (libc) && defined SHARED long_double_symbol (libc, __modfl, modfl); #endif diff --git a/sysdeps/ieee754/ldbl-opt/Makefile b/sysdeps/ieee754/ldbl-opt/Makefile index 74788ac999..d1eda5d022 100644 --- a/sysdeps/ieee754/ldbl-opt/Makefile +++ b/sysdeps/ieee754/ldbl-opt/Makefile @@ -11,47 +11,239 @@ libm-routines += s_nexttowardfd routines += nldbl-compat extra-libs += libnldbl -libnldbl-calls = asprintf dprintf fprintf fscanf fwprintf fwscanf iovfscanf \ - obstack_printf obstack_vprintf printf scanf snprintf \ - sprintf sscanf swprintf swscanf vasprintf vdprintf vfprintf \ - vfscanf vfwprintf vfwscanf vprintf vscanf 
vsnprintf \ - vsprintf vsscanf vswprintf vswscanf vwprintf vwscanf \ - wprintf wscanf printf_fp printf_size \ - fprintf_chk fwprintf_chk printf_chk snprintf_chk sprintf_chk \ - swprintf_chk vfprintf_chk vfwprintf_chk vprintf_chk \ - vsnprintf_chk vsprintf_chk vswprintf_chk vwprintf_chk \ - wprintf_chk asprintf_chk vasprintf_chk dprintf_chk \ - vdprintf_chk obstack_printf_chk obstack_vprintf_chk \ - syslog syslog_chk vsyslog vsyslog_chk \ - strfmon strfmon_l \ - strfroml \ - strtold strtold_l strtoldint wcstold wcstold_l wcstoldint \ - qecvt qfcvt qgcvt qecvt_r qfcvt_r \ - isinf isnan finite signbit scalb log2 lgamma_r ceil \ - significand acos asin atan atan2 cos sin tan cosh sinh \ - tanh acosh asinh atanh exp log log10 exp10 expm1 \ - log1p logb exp2 sqrt cbrt fabs floor j0 j1 y0 y1 erf erfc \ - lgamma tgamma gamma rint nearbyint round trunc \ - copysign fdim fmax fmin nextafter pow hypot fmod \ - remainder ldexp scalbn frexp modf scalbln fma nan sincos \ - jn yn ilogb remquo lrint lround llrint llround nexttowardf \ - nexttoward conj cacos cacosh casin catan catanh ccos ccosh \ - casinh cexp clog cproj csin csinh csqrt ctan ctanh cpow \ - cabs carg cimag creal clog10 \ - isoc99_scanf isoc99_fscanf isoc99_sscanf \ - isoc99_vscanf isoc99_vfscanf isoc99_vsscanf \ - isoc99_wscanf isoc99_fwscanf isoc99_swscanf \ - isoc99_vwscanf isoc99_vfwscanf isoc99_vswscanf \ - isoc23_scanf isoc23_fscanf isoc23_sscanf \ - isoc23_vscanf isoc23_vfscanf isoc23_vsscanf \ - isoc23_wscanf isoc23_fwscanf isoc23_swscanf \ - isoc23_vwscanf isoc23_vfwscanf isoc23_vswscanf \ - nextup nextdown totalorder totalordermag getpayload \ - canonicalize setpayload setpayloadsig llogb fmaxmag fminmag \ - roundeven fromfp ufromfp fromfpx ufromfpx fadd dadd \ - fdiv ddiv ffma dfma fmul dmul fsqrt dsqrt fsub dsub \ - fmaximum fmaximum_mag fmaximum_num fmaximum_mag_num \ - fminimum fminimum_mag fminimum_num fminimum_mag_num +libnldbl-calls = \ + acos \ + acosh \ + asin \ + asinh \ + asprintf \ + 
asprintf_chk \ + atan \ + atan2 \ + atanh \ + cabs \ + cacos \ + cacosh \ + canonicalize \ + carg \ + casin \ + casinh \ + catan \ + catanh \ + cbrt \ + ccos \ + ccosh \ + ceil \ + cexp \ + cimag \ + clog \ + clog10 \ + conj \ + copysign \ + cos \ + cosh \ + cpow \ + cproj \ + creal \ + csin \ + csinh \ + csqrt \ + ctan \ + ctanh \ + dadd \ + ddiv \ + dfma \ + dmul \ + dprintf \ + dprintf_chk \ + dsqrt \ + dsub \ + erf \ + erfc \ + exp \ + exp2 \ + exp10 \ + expm1 \ + fabs \ + fadd \ + fdim \ + fdiv \ + ffma \ + finite \ + floor \ + fma \ + fmax \ + fmaximum \ + fmaximum_mag \ + fmaximum_mag_num \ + fmaximum_num \ + fmaxmag \ + fmin \ + fminimum \ + fminimum_mag \ + fminimum_mag_num \ + fminimum_num \ + fminmag \ + fmod \ + fmul \ + fprintf \ + fprintf_chk \ + frexp \ + fromfp \ + fromfpx \ + fscanf \ + fsqrt \ + fsub \ + fwprintf \ + fwprintf_chk \ + fwscanf \ + gamma \ + getpayload \ + hypot \ + ilogb \ + iovfscanf \ + isinf \ + isnan \ + isoc23_fscanf \ + isoc23_fwscanf \ + isoc23_scanf \ + isoc23_sscanf \ + isoc23_swscanf \ + isoc23_vfscanf \ + isoc23_vfwscanf \ + isoc23_vscanf \ + isoc23_vsscanf \ + isoc23_vswscanf \ + isoc23_vwscanf \ + isoc23_wscanf \ + isoc99_fscanf \ + isoc99_fwscanf \ + isoc99_scanf \ + isoc99_sscanf \ + isoc99_swscanf \ + isoc99_vfscanf \ + isoc99_vfwscanf \ + isoc99_vscanf \ + isoc99_vsscanf \ + isoc99_vswscanf \ + isoc99_vwscanf \ + isoc99_wscanf \ + j0 \ + j1 \ + jn \ + ldexp \ + lgamma \ + lgamma_r \ + llogb \ + llrint \ + llround \ + log \ + log2 \ + log10 \ + log1p \ + log2p1 \ + logb \ + lrint \ + lround \ + modf \ + nan \ + nearbyint \ + nextafter \ + nextdown \ + nexttoward \ + nexttowardf \ + nextup \ + obstack_printf \ + obstack_printf_chk \ + obstack_vprintf \ + obstack_vprintf_chk \ + pow \ + printf \ + printf_chk \ + printf_fp \ + printf_size \ + qecvt \ + qecvt_r \ + qfcvt \ + qfcvt_r \ + qgcvt \ + remainder \ + remquo \ + rint \ + round \ + roundeven \ + scalb \ + scalbln \ + scalbn \ + scanf \ + setpayload \ + 
setpayloadsig \ + signbit \ + significand \ + sin \ + sincos \ + sinh \ + snprintf \ + snprintf_chk \ + sprintf \ + sprintf_chk \ + sqrt \ + sscanf \ + strfmon \ + strfmon_l \ + strfroml \ + strtold \ + strtold_l \ + strtoldint \ + swprintf \ + swprintf_chk \ + swscanf \ + syslog \ + syslog_chk \ + tan \ + tanh \ + tgamma \ + totalorder \ + totalordermag \ + trunc \ + ufromfp \ + ufromfpx \ + vasprintf \ + vasprintf_chk \ + vdprintf \ + vdprintf_chk \ + vfprintf \ + vfprintf_chk \ + vfscanf \ + vfwprintf \ + vfwprintf_chk \ + vfwscanf \ + vprintf \ + vprintf_chk \ + vscanf \ + vsnprintf \ + vsnprintf_chk \ + vsprintf \ + vsprintf_chk \ + vsscanf \ + vswprintf \ + vswprintf_chk \ + vswscanf \ + vsyslog \ + vsyslog_chk \ + vwprintf \ + vwprintf_chk \ + vwscanf \ + wcstold \ + wcstold_l \ + wcstoldint \ + wprintf \ + wprintf_chk \ + wscanf \ + y0 \ + y1 \ + yn \ +# libnldbl-calls libnldbl-routines = $(libnldbl-calls:%=nldbl-%) libnldbl-inhibit-o = $(object-suffixes) libnldbl-static-only-routines = $(libnldbl-routines) @@ -151,6 +343,7 @@ CFLAGS-nldbl-log.c = -fno-builtin-logl CFLAGS-nldbl-log10.c = -fno-builtin-log10l CFLAGS-nldbl-log1p.c = -fno-builtin-log1pl CFLAGS-nldbl-log2.c = -fno-builtin-log2l +CFLAGS-nldbl-log2p1.c = -fno-builtin-log2p1l CFLAGS-nldbl-logb.c = -fno-builtin-logbl CFLAGS-nldbl-lrint.c = -fno-builtin-lrintl CFLAGS-nldbl-lround.c = -fno-builtin-lroundl @@ -190,8 +383,11 @@ CFLAGS-nldbl-y0.c = -fno-builtin-y0l CFLAGS-nldbl-y1.c = -fno-builtin-y1l CFLAGS-nldbl-yn.c = -fno-builtin-ynl -tests += test-narrow-macros-ldbl-64 test-nldbl-redirect \ - test-redirection-ldbl-64 +tests += \ + test-narrow-macros-ldbl-64 \ + test-nldbl-redirect \ + test-redirection-ldbl-64 \ +# tests CFLAGS-test-narrow-macros-ldbl-64.c += -mlong-double-64 CFLAGS-test-nldbl-redirect.c += -mlong-double-64 CFLAGS-test-redirection-ldbl-64.c += -mlong-double-64 @@ -231,10 +427,12 @@ CFLAGS-tst-nldbl-errorfptr.c += -mlong-double-64 endif ifeq ($(subdir), stdio-common) -tests += 
tst-nldbl-scanf-binary-c11 \ - tst-nldbl-scanf-binary-c23 \ - tst-nldbl-scanf-binary-gnu11 \ - tst-nldbl-scanf-binary-gnu89 +tests += \ + tst-nldbl-scanf-binary-c11 \ + tst-nldbl-scanf-binary-c23 \ + tst-nldbl-scanf-binary-gnu11 \ + tst-nldbl-scanf-binary-gnu89 \ +# tests # Some versions of GCC supported for building glibc do not support -std=c23 # (added in GCC 14), or the older name -std=c2x (added in GCC 9), so @@ -252,10 +450,12 @@ CFLAGS-tst-nldbl-scanf-binary-gnu89.c += -mlong-double-64 -std=gnu89 \ endif ifeq ($(subdir), wcsmbs) -tests += tst-nldbl-wscanf-binary-c11 \ - tst-nldbl-wscanf-binary-c23 \ - tst-nldbl-wscanf-binary-gnu11 \ - tst-nldbl-wscanf-binary-gnu89 +tests += \ + tst-nldbl-wscanf-binary-c11 \ + tst-nldbl-wscanf-binary-c23 \ + tst-nldbl-wscanf-binary-gnu11 \ + tst-nldbl-wscanf-binary-gnu89 \ +# tests # Some versions of GCC supported for building glibc do not support -std=c23 # (added in GCC 14), or the older name -std=c2x (added in GCC 9), so diff --git a/sysdeps/ieee754/ldbl-opt/nldbl-log2p1.c b/sysdeps/ieee754/ldbl-opt/nldbl-log2p1.c new file mode 100644 index 0000000000..989c69e3d7 --- /dev/null +++ b/sysdeps/ieee754/ldbl-opt/nldbl-log2p1.c @@ -0,0 +1,8 @@ +#include "nldbl-compat.h" + +double +attribute_hidden +log2p1l (double x) +{ + return log2p1 (x); +} diff --git a/sysdeps/ieee754/ldbl-opt/s_ldexpl.c b/sysdeps/ieee754/ldbl-opt/s_ldexpl.c index 1afbe7d8ad..932cc4341c 100644 --- a/sysdeps/ieee754/ldbl-opt/s_ldexpl.c +++ b/sysdeps/ieee754/ldbl-opt/s_ldexpl.c @@ -17,13 +17,13 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#if IS_IN (libc) +#if IS_IN (libc) && defined SHARED # define declare_mgen_alias(f,t) #endif #include <math-type-macros-ldouble.h> #include <s_ldexp_template.c> -#if IS_IN (libc) +#if IS_IN (libc) && defined SHARED long_double_symbol (libc, __ldexpl, ldexpl); long_double_symbol (libc, __wrap_scalbnl, scalbnl); #endif diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile index 446bda6563..b00c090faa 100644 --- a/sysdeps/loongarch/Makefile +++ b/sysdeps/loongarch/Makefile @@ -1,5 +1,7 @@ ifeq ($(subdir),misc) -sysdep_headers += sys/asm.h +sysdep_headers += \ + sys/asm.h \ + # sysdep_headers tests += \ tst-hwcap-tunables \ @@ -9,21 +11,47 @@ tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd) endif ifeq ($(subdir),elf) -gen-as-const-headers += dl-link.sym +sysdep-dl-routines += \ + dl-tlsdesc \ + tlsdesc \ + # sysdep-dl-routines + +gen-as-const-headers += \ + dl-link.sym \ + # gen-as-const-headers +endif + +ifeq ($(subdir),csu) +gen-as-const-headers += \ + tlsdesc.sym \ + # gen-as-const-headers endif ifeq ($(subdir),elf) sysdep-dl-routines += \ dl-get-cpu-features \ # sysdep-dl-routines + +# Disable the compiler from using LSX for TLS descriptor tests, or storing into +# 16B TLS variable may clobber FP/vector registers and prevent us from checking +# their contents. +ifeq (yes,$(loongarch-vec-com)) +CFLAGS-tst-gnu2-tls2mod0.c += -mno-lsx +CFLAGS-tst-gnu2-tls2mod1.c += -mno-lsx +CFLAGS-tst-gnu2-tls2mod2.c += -mno-lsx +endif endif # LoongArch's assembler also needs to know about PIC as it changes the # definition of some assembler macros. -ASFLAGS-.os += $(pic-ccflag) +ASFLAGS-.os += \ + $(pic-ccflag) \ + # ASFLAGS-.os # All the objects in lib*_nonshared.a need to be compiled with medium code # model or large applications may fail to link. 
ifeq (yes,$(have-cmodel-medium)) -CFLAGS-.oS += -mcmodel=medium +CFLAGS-.oS += \ + -mcmodel=medium \ + # CFLAGS-.oS endif diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure index 30b60d1983..afcef6cd22 100644 --- a/sysdeps/loongarch/configure +++ b/sysdeps/loongarch/configure @@ -110,3 +110,39 @@ if test $libc_cv_loongarch_vec_asm = no; then as_fn_error $? "binutils version is too old, use 2.41 or newer version" "$LINENO" 5 fi + +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for vector support in compiler" >&5 +printf %s "checking for vector support in compiler... " >&6; } +if test ${libc_cv_loongarch_vec_com+y} +then : + printf %s "(cached) " >&6 +else $as_nop + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +void foo (void) +{ + asm volatile ("vldi \$vr0, 1" ::: "\$vr0"); + asm volatile ("xvldi \$xr0, 1" ::: "\$xr0"); +} + +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + libc_cv_loongarch_vec_com=yes +else $as_nop + libc_cv_loongarch_vec_com=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_loongarch_vec_com" >&5 +printf "%s\n" "$libc_cv_loongarch_vec_com" >&6; } +if test "$libc_cv_loongarch_vec_com" = yes ; +then + printf "%s\n" "#define HAVE_LOONGARCH_VEC_COM 1" >>confdefs.h + +fi +config_vars="$config_vars +loongarch-vec-com = $libc_cv_loongarch_vec_com" + diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac index 28a8ae5486..c56a203574 100644 --- a/sysdeps/loongarch/configure.ac +++ b/sysdeps/loongarch/configure.ac @@ -65,3 +65,20 @@ rm -f conftest*]) if test $libc_cv_loongarch_vec_asm = no; then AC_MSG_ERROR([binutils version is too old, use 2.41 or newer version]) fi + +AC_CACHE_CHECK([for vector support in compiler], + libc_cv_loongarch_vec_com, [ +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +void foo (void) +{ + asm volatile ("vldi \$vr0, 1" ::: "\$vr0"); + asm volatile ("xvldi \$xr0, 
1" ::: "\$xr0"); +} +]])], + [libc_cv_loongarch_vec_com=yes], + [libc_cv_loongarch_vec_com=no])]) +if test "$libc_cv_loongarch_vec_com" = yes ; +then + AC_DEFINE(HAVE_LOONGARCH_VEC_COM) +fi +LIBC_CONFIG_VAR([loongarch-vec-com], [$libc_cv_loongarch_vec_com]) diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h index 6baf0e600a..ab6f1da7c0 100644 --- a/sysdeps/loongarch/dl-machine.h +++ b/sysdeps/loongarch/dl-machine.h @@ -25,7 +25,7 @@ #include <entry.h> #include <elf/elf.h> #include <sys/asm.h> -#include <dl-tls.h> +#include <dl-tlsdesc.h> #include <dl-static-tls.h> #include <dl-machine-rel.h> @@ -206,6 +206,36 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[], *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend; break; + case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32: + { + struct tlsdesc volatile *td = (struct tlsdesc volatile *)addr_field; + if (sym == NULL) + { + td->arg = (void*)reloc->r_addend; + td->entry = _dl_tlsdesc_undefweak; + } + else + { +# ifndef SHARED + CHECK_STATIC_TLS (map, sym_map); +# else + if (!TRY_STATIC_TLS (map, sym_map)) + { + td->arg = _dl_make_tlsdesc_dynamic (sym_map, + sym->st_value + reloc->r_addend); + td->entry = _dl_tlsdesc_dynamic; + } + else +# endif + { + td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym) + + reloc->r_addend); + td->entry = _dl_tlsdesc_return; + } + } + break; + } + case R_LARCH_COPY: { if (sym == NULL) @@ -274,6 +304,26 @@ elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[], else *reloc_addr = map->l_mach.plt; } + else if (__glibc_likely (r_type == R_LARCH_TLS_DESC64) + || __glibc_likely (r_type == R_LARCH_TLS_DESC32)) + { + const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info); + const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]); + const ElfW (Sym) *sym = &symtab[symndx]; + const struct r_found_version *version = NULL; + + if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL) + { + const ElfW (Half) 
*vernum = (const void *)D_PTR (map, + l_info[VERSYMIDX (DT_VERSYM)]); + version = &map->l_versions[vernum[symndx] & 0x7fff]; + } + + /* Always initialize TLS descriptors completely, because lazy + initialization requires synchronization at every TLS access. */ + elf_machine_rela (map, scope, reloc, sym, version, reloc_addr, + skip_ifunc); + } else _dl_reloc_bad_type (map, r_type, 1); } diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h index 29924b866d..de593c002d 100644 --- a/sysdeps/loongarch/dl-tls.h +++ b/sysdeps/loongarch/dl-tls.h @@ -16,6 +16,9 @@ License along with the GNU C Library. If not, see <https://www.gnu.org/licenses/>. */ +#ifndef _DL_TLS_H +#define _DL_TLS_H + /* Type used for the representation of TLS information in the GOT. */ typedef struct { @@ -23,6 +26,8 @@ typedef struct unsigned long int ti_offset; } tls_index; +extern void *__tls_get_addr (tls_index *ti); + /* The thread pointer points to the first static TLS block. */ #define TLS_TP_OFFSET 0 @@ -37,10 +42,10 @@ typedef struct /* Compute the value for a DTPREL reloc. */ #define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET) -extern void *__tls_get_addr (tls_index *ti); - #define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET) #define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET) /* Value used for dtv entries for which the allocation is delayed. */ #define TLS_DTV_UNALLOCATED ((void *) -1l) + +#endif diff --git a/sysdeps/loongarch/dl-tlsdesc.S b/sysdeps/loongarch/dl-tlsdesc.S new file mode 100644 index 0000000000..15d5fa1c42 --- /dev/null +++ b/sysdeps/loongarch/dl-tlsdesc.S @@ -0,0 +1,436 @@ +/* Thread-local storage handling in the ELF dynamic linker. + LoongArch version. + Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <tls.h> +#include "tlsdesc.h" + + .text + + /* Compute the thread pointer offset for symbols in the static + TLS block. The offset is the same for all threads. + Prototype: + _dl_tlsdesc_return (tlsdesc *); */ + .hidden _dl_tlsdesc_return + .global _dl_tlsdesc_return + .type _dl_tlsdesc_return,%function + cfi_startproc + .align 2 +_dl_tlsdesc_return: + REG_L a0, a0, 8 + RET + cfi_endproc + .size _dl_tlsdesc_return, .-_dl_tlsdesc_return + + /* Handler for undefined weak TLS symbols. + Prototype: + _dl_tlsdesc_undefweak (tlsdesc *); + + The second word of the descriptor contains the addend. + Return the addend minus the thread pointer. This ensures + that when the caller adds on the thread pointer it gets back + the addend. */ + .hidden _dl_tlsdesc_undefweak + .global _dl_tlsdesc_undefweak + .type _dl_tlsdesc_undefweak,%function + cfi_startproc + .align 2 +_dl_tlsdesc_undefweak: + REG_L a0, a0, 8 + sub.d a0, a0, tp + RET + cfi_endproc + .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak + + +#ifdef SHARED + +#define FRAME_SIZE (-((-14 * SZREG) & ALMASK)) +#define FRAME_SIZE_LSX (-((-32 * SZVREG) & ALMASK)) +#define FRAME_SIZE_LASX (-((-32 * SZXREG) & ALMASK)) +#define FRAME_SIZE_FLOAT (-((-24 * SZFREG) & ALMASK)) + + /* Handler for dynamic TLS symbols. 
+ Prototype: + _dl_tlsdesc_dynamic (tlsdesc *) ; + + The second word of the descriptor points to a + tlsdesc_dynamic_arg structure. + + Returns the offset between the thread pointer and the + object referenced by the argument. + + ptrdiff_t + _dl_tlsdesc_dynamic (struct tlsdesc *tdp) + { + struct tlsdesc_dynamic_arg *td = tdp->arg; + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer - TCBHEAD_DTV); + if (__glibc_likely (td->gen_count <= dtv[0].counter + && (dtv[td->tlsinfo.ti_module].pointer.val + != TLS_DTV_UNALLOCATED), + 1)) + return dtv[td->tlsinfo.ti_module].pointer.val + + td->tlsinfo.ti_offset + - __thread_pointer; + + return ___tls_get_addr (&td->tlsinfo) - __thread_pointer; + } */ + .hidden _dl_tlsdesc_dynamic + .global _dl_tlsdesc_dynamic + .type _dl_tlsdesc_dynamic,%function + cfi_startproc + .align 2 +_dl_tlsdesc_dynamic: + /* Save just enough registers to support fast path, if we fall + into slow path we will save additional registers. */ + ADDI sp, sp, -24 + REG_S t0, sp, 0 + REG_S t1, sp, 8 + REG_S t2, sp, 16 + +/* Runtime Storage Layout of Thread-Local Storage + TP point to the start of TLS block. + + dtv +Low address TCB ----------------> dtv0(counter) + TP --> static_block0 <----- dtv1 + static_block1 <----- dtv2 + static_block2 <----- dtv3 + dynamic_block0 <----- dtv4 +Hign address dynamic_block1 <----- dtv5 */ + + REG_L t0, tp, -SIZE_OF_TCB /* t0 = dtv */ + REG_L a0, a0, TLSDESC_ARG /* a0(td) = tdp->arg */ + REG_L t1, a0, TLSDESC_GEN_COUNT /* t1 = td->gen_count */ + REG_L t2, t0, DTV_COUNTER /* t2 = dtv[0].counter */ + /* If dtv[0].counter < td->gen_count, goto slow path. 
*/ + bltu t2, t1, .Lslow + + REG_L t1, a0, TLSDESC_MODID /* t1 = td->tlsinfo.ti_module */ + /* t1 = t1 * sizeof(dtv_t) = t1 * (2 * sizeof(void*)) */ + slli.d t1, t1, 4 + add.d t1, t1, t0 /* t1 = dtv[td->tlsinfo.ti_module] */ + REG_L t1, t1, 0 /* t1 = dtv[td->tlsinfo.ti_module].pointer.val */ + li.d t2, TLS_DTV_UNALLOCATED + /* If dtv[td->tlsinfo.ti_module].pointer.val is TLS_DTV_UNALLOCATED, + goto slow path. */ + beq t1, t2, .Lslow + + REG_L t2, a0, TLSDESC_MODOFF /* t2 = td->tlsinfo.ti_offset */ + /* dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset */ + add.d a0, t1, t2 +.Lret: + sub.d a0, a0, tp + REG_L t0, sp, 0 + REG_L t1, sp, 8 + REG_L t2, sp, 16 + ADDI sp, sp, 24 + RET + +.Lslow: + /* This is the slow path. We need to call __tls_get_addr() which + means we need to save and restore all the register that the + callee will trash. */ + + /* Save the remaining registers that we must treat as caller save. */ + ADDI sp, sp, -FRAME_SIZE + REG_S ra, sp, 0 * SZREG + REG_S a1, sp, 1 * SZREG + REG_S a2, sp, 2 * SZREG + REG_S a3, sp, 3 * SZREG + REG_S a4, sp, 4 * SZREG + REG_S a5, sp, 5 * SZREG + REG_S a6, sp, 6 * SZREG + REG_S a7, sp, 7 * SZREG + REG_S t3, sp, 8 * SZREG + REG_S t4, sp, 9 * SZREG + REG_S t5, sp, 10 * SZREG + REG_S t6, sp, 11 * SZREG + REG_S t7, sp, 12 * SZREG + REG_S t8, sp, 13 * SZREG + +#ifndef __loongarch_soft_float + + /* Save fcsr0 register. + Only one physical fcsr0 register, fcsr1-fcsr3 are aliases + of some fields in fcsr0. */ + ADDI sp, sp, -SZFCSREG + movfcsr2gr t0, fcsr0 + st.w t0, sp, 0 + + /* Whether support LASX. */ + la.global t0, _rtld_global_ro + REG_L t0, t0, GLRO_DL_HWCAP_OFFSET + andi t0, t0, HWCAP_LOONGARCH_LASX + beqz t0, .Llsx + + /* Save 256-bit vector registers. + FIXME: Without vector ABI, save all vector registers. 
*/ + ADDI sp, sp, -FRAME_SIZE_LASX + xvst xr0, sp, 0*SZXREG + xvst xr1, sp, 1*SZXREG + xvst xr2, sp, 2*SZXREG + xvst xr3, sp, 3*SZXREG + xvst xr4, sp, 4*SZXREG + xvst xr5, sp, 5*SZXREG + xvst xr6, sp, 6*SZXREG + xvst xr7, sp, 7*SZXREG + xvst xr8, sp, 8*SZXREG + xvst xr9, sp, 9*SZXREG + xvst xr10, sp, 10*SZXREG + xvst xr11, sp, 11*SZXREG + xvst xr12, sp, 12*SZXREG + xvst xr13, sp, 13*SZXREG + xvst xr14, sp, 14*SZXREG + xvst xr15, sp, 15*SZXREG + xvst xr16, sp, 16*SZXREG + xvst xr17, sp, 17*SZXREG + xvst xr18, sp, 18*SZXREG + xvst xr19, sp, 19*SZXREG + xvst xr20, sp, 20*SZXREG + xvst xr21, sp, 21*SZXREG + xvst xr22, sp, 22*SZXREG + xvst xr23, sp, 23*SZXREG + xvst xr24, sp, 24*SZXREG + xvst xr25, sp, 25*SZXREG + xvst xr26, sp, 26*SZXREG + xvst xr27, sp, 27*SZXREG + xvst xr28, sp, 28*SZXREG + xvst xr29, sp, 29*SZXREG + xvst xr30, sp, 30*SZXREG + xvst xr31, sp, 31*SZXREG + b .Ltga + +.Llsx: + /* Whether support LSX. */ + andi t0, t0, HWCAP_LOONGARCH_LSX + beqz t0, .Lfloat + + /* Save 128-bit vector registers. */ + ADDI sp, sp, -FRAME_SIZE_LSX + vst vr0, sp, 0*SZVREG + vst vr1, sp, 1*SZVREG + vst vr2, sp, 2*SZVREG + vst vr3, sp, 3*SZVREG + vst vr4, sp, 4*SZVREG + vst vr5, sp, 5*SZVREG + vst vr6, sp, 6*SZVREG + vst vr7, sp, 7*SZVREG + vst vr8, sp, 8*SZVREG + vst vr9, sp, 9*SZVREG + vst vr10, sp, 10*SZVREG + vst vr11, sp, 11*SZVREG + vst vr12, sp, 12*SZVREG + vst vr13, sp, 13*SZVREG + vst vr14, sp, 14*SZVREG + vst vr15, sp, 15*SZVREG + vst vr16, sp, 16*SZVREG + vst vr17, sp, 17*SZVREG + vst vr18, sp, 18*SZVREG + vst vr19, sp, 19*SZVREG + vst vr20, sp, 20*SZVREG + vst vr21, sp, 21*SZVREG + vst vr22, sp, 22*SZVREG + vst vr23, sp, 23*SZVREG + vst vr24, sp, 24*SZVREG + vst vr25, sp, 25*SZVREG + vst vr26, sp, 26*SZVREG + vst vr27, sp, 27*SZVREG + vst vr28, sp, 28*SZVREG + vst vr29, sp, 29*SZVREG + vst vr30, sp, 30*SZVREG + vst vr31, sp, 31*SZVREG + b .Ltga + +.Lfloat: + /* Save float registers. 
*/ + ADDI sp, sp, -FRAME_SIZE_FLOAT + FREG_S fa0, sp, 0*SZFREG + FREG_S fa1, sp, 1*SZFREG + FREG_S fa2, sp, 2*SZFREG + FREG_S fa3, sp, 3*SZFREG + FREG_S fa4, sp, 4*SZFREG + FREG_S fa5, sp, 5*SZFREG + FREG_S fa6, sp, 6*SZFREG + FREG_S fa7, sp, 7*SZFREG + FREG_S ft0, sp, 8*SZFREG + FREG_S ft1, sp, 9*SZFREG + FREG_S ft2, sp, 10*SZFREG + FREG_S ft3, sp, 11*SZFREG + FREG_S ft4, sp, 12*SZFREG + FREG_S ft5, sp, 13*SZFREG + FREG_S ft6, sp, 14*SZFREG + FREG_S ft7, sp, 15*SZFREG + FREG_S ft8, sp, 16*SZFREG + FREG_S ft9, sp, 17*SZFREG + FREG_S ft10, sp, 18*SZFREG + FREG_S ft11, sp, 19*SZFREG + FREG_S ft12, sp, 20*SZFREG + FREG_S ft13, sp, 21*SZFREG + FREG_S ft14, sp, 22*SZFREG + FREG_S ft15, sp, 23*SZFREG + +#endif /* #ifndef __loongarch_soft_float */ + +.Ltga: + bl HIDDEN_JUMPTARGET(__tls_get_addr) + ADDI a0, a0, -TLS_DTV_OFFSET + +#ifndef __loongarch_soft_float + + la.global t0, _rtld_global_ro + REG_L t0, t0, GLRO_DL_HWCAP_OFFSET + andi t0, t0, HWCAP_LOONGARCH_LASX + beqz t0, .Llsx1 + + /* Restore 256-bit vector registers. 
*/ + xvld xr0, sp, 0*SZXREG + xvld xr1, sp, 1*SZXREG + xvld xr2, sp, 2*SZXREG + xvld xr3, sp, 3*SZXREG + xvld xr4, sp, 4*SZXREG + xvld xr5, sp, 5*SZXREG + xvld xr6, sp, 6*SZXREG + xvld xr7, sp, 7*SZXREG + xvld xr8, sp, 8*SZXREG + xvld xr9, sp, 9*SZXREG + xvld xr10, sp, 10*SZXREG + xvld xr11, sp, 11*SZXREG + xvld xr12, sp, 12*SZXREG + xvld xr13, sp, 13*SZXREG + xvld xr14, sp, 14*SZXREG + xvld xr15, sp, 15*SZXREG + xvld xr16, sp, 16*SZXREG + xvld xr17, sp, 17*SZXREG + xvld xr18, sp, 18*SZXREG + xvld xr19, sp, 19*SZXREG + xvld xr20, sp, 20*SZXREG + xvld xr21, sp, 21*SZXREG + xvld xr22, sp, 22*SZXREG + xvld xr23, sp, 23*SZXREG + xvld xr24, sp, 24*SZXREG + xvld xr25, sp, 25*SZXREG + xvld xr26, sp, 26*SZXREG + xvld xr27, sp, 27*SZXREG + xvld xr28, sp, 28*SZXREG + xvld xr29, sp, 29*SZXREG + xvld xr30, sp, 30*SZXREG + xvld xr31, sp, 31*SZXREG + ADDI sp, sp, FRAME_SIZE_LASX + b .Lfcsr + +.Llsx1: + andi t0, s0, HWCAP_LOONGARCH_LSX + beqz t0, .Lfloat1 + + /* Restore 128-bit vector registers. */ + vld vr0, sp, 0*SZVREG + vld vr1, sp, 1*SZVREG + vld vr2, sp, 2*SZVREG + vld vr3, sp, 3*SZVREG + vld vr4, sp, 4*SZVREG + vld vr5, sp, 5*SZVREG + vld vr6, sp, 6*SZVREG + vld vr7, sp, 7*SZVREG + vld vr8, sp, 8*SZVREG + vld vr9, sp, 9*SZVREG + vld vr10, sp, 10*SZVREG + vld vr11, sp, 11*SZVREG + vld vr12, sp, 12*SZVREG + vld vr13, sp, 13*SZVREG + vld vr14, sp, 14*SZVREG + vld vr15, sp, 15*SZVREG + vld vr16, sp, 16*SZVREG + vld vr17, sp, 17*SZVREG + vld vr18, sp, 18*SZVREG + vld vr19, sp, 19*SZVREG + vld vr20, sp, 20*SZVREG + vld vr21, sp, 21*SZVREG + vld vr22, sp, 22*SZVREG + vld vr23, sp, 23*SZVREG + vld vr24, sp, 24*SZVREG + vld vr25, sp, 25*SZVREG + vld vr26, sp, 26*SZVREG + vld vr27, sp, 27*SZVREG + vld vr28, sp, 28*SZVREG + vld vr29, sp, 29*SZVREG + vld vr30, sp, 30*SZVREG + vld vr31, sp, 31*SZVREG + ADDI sp, sp, FRAME_SIZE_LSX + b .Lfcsr + +.Lfloat1: + /* Restore float registers. 
*/ + FREG_L fa0, sp, 0*SZFREG + FREG_L fa1, sp, 1*SZFREG + FREG_L fa2, sp, 2*SZFREG + FREG_L fa3, sp, 3*SZFREG + FREG_L fa4, sp, 4*SZFREG + FREG_L fa5, sp, 5*SZFREG + FREG_L fa6, sp, 6*SZFREG + FREG_L fa7, sp, 7*SZFREG + FREG_L ft0, sp, 8*SZFREG + FREG_L ft1, sp, 9*SZFREG + FREG_L ft2, sp, 10*SZFREG + FREG_L ft3, sp, 11*SZFREG + FREG_L ft4, sp, 12*SZFREG + FREG_L ft5, sp, 13*SZFREG + FREG_L ft6, sp, 14*SZFREG + FREG_L ft7, sp, 15*SZFREG + FREG_L ft8, sp, 16*SZFREG + FREG_L ft9, sp, 17*SZFREG + FREG_L ft10, sp, 18*SZFREG + FREG_L ft11, sp, 19*SZFREG + FREG_L ft12, sp, 20*SZFREG + FREG_L ft13, sp, 21*SZFREG + FREG_L ft14, sp, 22*SZFREG + FREG_L ft15, sp, 23*SZFREG + ADDI sp, sp, FRAME_SIZE_FLOAT + +.Lfcsr: + /* Restore fcsr0 register. */ + ld.w t0, sp, 0 + movgr2fcsr fcsr0, t0 + ADDI sp, sp, SZFCSREG + +#endif /* #ifndef __loongarch_soft_float */ + + REG_L ra, sp, 0 * SZREG + REG_L a1, sp, 1 * SZREG + REG_L a2, sp, 2 * SZREG + REG_L a3, sp, 3 * SZREG + REG_L a4, sp, 4 * SZREG + REG_L a5, sp, 5 * SZREG + REG_L a6, sp, 6 * SZREG + REG_L a7, sp, 7 * SZREG + REG_L t3, sp, 8 * SZREG + REG_L t4, sp, 9 * SZREG + REG_L t5, sp, 10 * SZREG + REG_L t6, sp, 11 * SZREG + REG_L t7, sp, 12 * SZREG + REG_L t8, sp, 13 * SZREG + ADDI sp, sp, FRAME_SIZE + + b .Lret + cfi_endproc + .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic + .hidden HIDDEN_JUMPTARGET(__tls_get_addr) + +#endif /* #ifdef SHARED */ diff --git a/sysdeps/loongarch/dl-tlsdesc.h b/sysdeps/loongarch/dl-tlsdesc.h new file mode 100644 index 0000000000..ff8c69cb93 --- /dev/null +++ b/sysdeps/loongarch/dl-tlsdesc.h @@ -0,0 +1,49 @@ +/* Thread-local storage descriptor handling in the ELF dynamic linker. + LoongArch version. + Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _DL_TLSDESC_H +#define _DL_TLSDESC_H + +#include <dl-tls.h> + +/* Type used to represent a TLS descriptor in the GOT. */ +struct tlsdesc +{ + ptrdiff_t (*entry) (struct tlsdesc *); + void *arg; +}; + +/* Type used as the argument in a TLS descriptor for a symbol that + needs dynamic TLS offsets. */ +struct tlsdesc_dynamic_arg +{ + tls_index tlsinfo; + size_t gen_count; +}; + +extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *); +extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *); + +#ifdef SHARED +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t); +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *); +#endif + +#endif diff --git a/sysdeps/loongarch/fpu_control.h b/sysdeps/loongarch/fpu_control.h index 54add4e01c..3cdf2417d9 100644 --- a/sysdeps/loongarch/fpu_control.h +++ b/sysdeps/loongarch/fpu_control.h @@ -91,8 +91,8 @@ typedef unsigned int fpu_control_t __attribute__ ((__mode__ (__SI__))); /* Macros for accessing the hardware control word. 
*/ extern fpu_control_t __loongarch_fpu_getcw (void) __THROW; extern void __loongarch_fpu_setcw (fpu_control_t) __THROW; -#define _FPU_GETCW(cw) __asm__ volatile ("movfcsr2gr %0,$r0" : "=r"(cw)) -#define _FPU_SETCW(cw) __asm__ volatile ("movgr2fcsr $r0,%0" : : "r"(cw)) +#define _FPU_GETCW(cw) __asm__ volatile ("movfcsr2gr %0,$fcsr0" : "=r"(cw)) +#define _FPU_SETCW(cw) __asm__ volatile ("movgr2fcsr $fcsr0,%0" : : "r"(cw)) /* Default control word set at startup. */ extern fpu_control_t __fpu_control; diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h index 4d8737ee7f..833dc9eb82 100644 --- a/sysdeps/loongarch/linkmap.h +++ b/sysdeps/loongarch/linkmap.h @@ -18,5 +18,6 @@ struct link_map_machine { - ElfW (Addr) plt; /* Address of .plt. */ + ElfW (Addr) plt; /* Address of .plt. */ + void *tlsdesc_table; /* Address of TLS descriptor hash table. */ }; diff --git a/sysdeps/loongarch/lp64/libm-test-ulps b/sysdeps/loongarch/lp64/libm-test-ulps index 770bf36b11..185c5b4e92 100644 --- a/sysdeps/loongarch/lp64/libm-test-ulps +++ b/sysdeps/loongarch/lp64/libm-test-ulps @@ -1202,6 +1202,26 @@ Function: "log2_upward": double: 3 ldouble: 1 +Function: "log2p1": +double: 1 +float: 1 +ldouble: 3 + +Function: "log2p1_downward": +double: 2 +float: 2 +ldouble: 3 + +Function: "log2p1_towardzero": +double: 2 +float: 2 +ldouble: 2 + +Function: "log2p1_upward": +double: 1 +float: 2 +ldouble: 2 + Function: "log_downward": ldouble: 1 diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S index 9826d21c2e..3606225ca2 100644 --- a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S @@ -98,5 +98,7 @@ L(out): jr ra END(STRNLEN) +#if !IS_IN (libc) weak_alias (STRNLEN, strnlen) libc_hidden_builtin_def (STRNLEN) +#endif diff --git a/sysdeps/loongarch/preconfigure b/sysdeps/loongarch/preconfigure index dfc7ecfd9e..0d1e9ed8df 100644 --- 
a/sysdeps/loongarch/preconfigure +++ b/sysdeps/loongarch/preconfigure @@ -43,6 +43,7 @@ loongarch*) base_machine=loongarch + mtls_descriptor=desc ;; esac diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h index 51521a7eb4..23c1d12914 100644 --- a/sysdeps/loongarch/sys/asm.h +++ b/sysdeps/loongarch/sys/asm.h @@ -25,6 +25,7 @@ /* Macros to handle different pointer/register sizes for 32/64-bit code. */ #define SZREG 8 #define SZFREG 8 +#define SZFCSREG 4 #define SZVREG 16 #define SZXREG 32 #define REG_L ld.d diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h index f61ee25b25..80ce3e9c00 100644 --- a/sysdeps/loongarch/sys/regdef.h +++ b/sysdeps/loongarch/sys/regdef.h @@ -97,6 +97,7 @@ #define fcc5 $fcc5 #define fcc6 $fcc6 #define fcc7 $fcc7 +#define fcsr0 $fcsr0 #define vr0 $vr0 #define vr1 $vr1 diff --git a/sysdeps/loongarch/tlsdesc.c b/sysdeps/loongarch/tlsdesc.c new file mode 100644 index 0000000000..76708f7e1e --- /dev/null +++ b/sysdeps/loongarch/tlsdesc.c @@ -0,0 +1,39 @@ +/* Manage TLS descriptors. LoongArch64 version. + + Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <ldsodefs.h> +#include <tls.h> +#include <dl-tlsdesc.h> +#include <dl-unmap-segments.h> +#include <tlsdeschtab.h> + +/* Unmap the dynamic object, but also release its TLS descriptor table + if there is one. */ + +void +_dl_unmap (struct link_map *map) +{ + _dl_unmap_segments (map); + +#ifdef SHARED + if (map->l_mach.tlsdesc_table) + htab_delete (map->l_mach.tlsdesc_table); +#endif +} diff --git a/sysdeps/loongarch/tlsdesc.sym b/sysdeps/loongarch/tlsdesc.sym new file mode 100644 index 0000000000..213d0b3074 --- /dev/null +++ b/sysdeps/loongarch/tlsdesc.sym @@ -0,0 +1,28 @@ +#include <stddef.h> +#include <sysdep.h> +#include <tls.h> +#include <link.h> +#include <dl-tlsdesc.h> + +#define SHARED 1 + +#include <ldsodefs.h> + +#define GLRO_offsetof(name) offsetof (struct rtld_global_ro, _##name) + +-- + +-- Abuse tls.h macros to derive offsets relative to the thread register. + +TLSDESC_ARG offsetof(struct tlsdesc, arg) +TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count) +TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module) +TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset) +TCBHEAD_DTV offsetof(tcbhead_t, dtv) +DTV_COUNTER offsetof(dtv_t, counter) +TLS_DTV_UNALLOCATED TLS_DTV_UNALLOCATED +TLS_DTV_OFFSET TLS_DTV_OFFSET +SIZE_OF_TCB sizeof(tcbhead_t) +GLRO_DL_HWCAP_OFFSET GLRO_offsetof (dl_hwcap) +HWCAP_LOONGARCH_LSX HWCAP_LOONGARCH_LSX +HWCAP_LOONGARCH_LASX HWCAP_LOONGARCH_LASX diff --git a/sysdeps/loongarch/tst-gnu2-tls2.h b/sysdeps/loongarch/tst-gnu2-tls2.h new file mode 100644 index 0000000000..8e4216785d --- /dev/null +++ b/sysdeps/loongarch/tst-gnu2-tls2.h @@ -0,0 +1,377 @@ +/* Test TLSDESC relocation. LoongArch64 version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <string.h> +#include <stdlib.h> +#include <sys/auxv.h> + +/* The instruction between BEFORE_TLSDESC_CALL and _dl_tlsdesc_dynamic, + and the instruction between _dl_tlsdesc_dynamic and AFTER_TLSDESC_CALL, + may modified most of the general-purpose register. */ +#define SAVE_REGISTER(src) \ + asm volatile ("st.d $r3, %0" :"=m"(src) :); + +#ifdef __loongarch_soft_float + +#define BEFORE_TLSDESC_CALL() \ + uint64_t src; \ + SAVE_REGISTER (src); + +#define AFTER_TLSDESC_CALL() \ + uint64_t restore; \ + SAVE_REGISTER (restore); \ + if (src != restore) \ + abort (); + +#else /* hard float */ + +#define SAVE_REGISTER_FCC(src) \ + asm volatile ("movcf2gr $t0, $fcc0" ::: "$t0"); \ + asm volatile ("st.d $t0, %0" :"=m"(src[0]) :); \ + asm volatile ("movcf2gr $t0, $fcc1" ::: "$t0"); \ + asm volatile ("st.d $t0, %0" :"=m"(src[1]) :); \ + asm volatile ("movcf2gr $t0, $fcc2" ::: "$t0"); \ + asm volatile ("st.d $t0, %0" :"=m"(src[2]) :); \ + asm volatile ("movcf2gr $t0, $fcc3" ::: "$t0"); \ + asm volatile ("st.d $t0, %0" :"=m"(src[3]) :); \ + asm volatile ("movcf2gr $t0, $fcc4" ::: "$t0"); \ + asm volatile ("st.d $t0, %0" :"=m"(src[4]) :); \ + asm volatile ("movcf2gr $t0, $fcc5" ::: "$t0"); \ + asm volatile ("st.d $t0, %0" :"=m"(src[5]) :); \ + asm volatile ("movcf2gr $t0, $fcc6" ::: 
"$t0"); \ + asm volatile ("st.d $t0, %0" :"=m"(src[6]) :); \ + asm volatile ("movcf2gr $t0, $fcc7" ::: "$t0"); \ + asm volatile ("st.d $t0, %0" :"=m"(src[7]) :); + +#define LOAD_REGISTER_FCSR() \ + asm volatile ("li.d $t0, 0x01010101" ::: "$t0"); \ + asm volatile ("movgr2fcsr $fcsr0, $t0" :::); + +#define SAVE_REGISTER_FCSR() \ + asm volatile ("movfcsr2gr $t0, $fcsr0" ::: "$t0"); \ + asm volatile ("st.d $t0, %0" :"=m"(restore_fcsr) :); + +# define INIT_TLSDESC_CALL() \ + unsigned long hwcap = getauxval (AT_HWCAP); + +#define LOAD_REGISTER_FLOAT() \ + asm volatile ("fld.d $f0, %0" ::"m"(src_float[0]) :"$f0"); \ + asm volatile ("fld.d $f1, %0" ::"m"(src_float[1]) :"$f1"); \ + asm volatile ("fld.d $f2, %0" ::"m"(src_float[2]) :"$f2"); \ + asm volatile ("fld.d $f3, %0" ::"m"(src_float[3]) :"$f3"); \ + asm volatile ("fld.d $f4, %0" ::"m"(src_float[4]) :"$f4"); \ + asm volatile ("fld.d $f5, %0" ::"m"(src_float[5]) :"$f5"); \ + asm volatile ("fld.d $f6, %0" ::"m"(src_float[6]) :"$f6"); \ + asm volatile ("fld.d $f7, %0" ::"m"(src_float[7]) :"$f7"); \ + asm volatile ("fld.d $f8, %0" ::"m"(src_float[8]) :"$f8"); \ + asm volatile ("fld.d $f9, %0" ::"m"(src_float[9]) :"$f9"); \ + asm volatile ("fld.d $f10, %0" ::"m"(src_float[10]) :"$f10"); \ + asm volatile ("fld.d $f11, %0" ::"m"(src_float[11]) :"$f11"); \ + asm volatile ("fld.d $f12, %0" ::"m"(src_float[12]) :"$f12"); \ + asm volatile ("fld.d $f13, %0" ::"m"(src_float[13]) :"$f13"); \ + asm volatile ("fld.d $f14, %0" ::"m"(src_float[14]) :"$f14"); \ + asm volatile ("fld.d $f15, %0" ::"m"(src_float[15]) :"$f15"); \ + asm volatile ("fld.d $f16, %0" ::"m"(src_float[16]) :"$f16"); \ + asm volatile ("fld.d $f17, %0" ::"m"(src_float[17]) :"$f17"); \ + asm volatile ("fld.d $f18, %0" ::"m"(src_float[18]) :"$f18"); \ + asm volatile ("fld.d $f19, %0" ::"m"(src_float[19]) :"$f19"); \ + asm volatile ("fld.d $f20, %0" ::"m"(src_float[20]) :"$f20"); \ + asm volatile ("fld.d $f21, %0" ::"m"(src_float[21]) :"$f21"); \ + asm volatile ("fld.d 
$f22, %0" ::"m"(src_float[22]) :"$f22"); \ + asm volatile ("fld.d $f23, %0" ::"m"(src_float[23]) :"$f23"); \ + asm volatile ("fld.d $f24, %0" ::"m"(src_float[24]) :"$f24"); \ + asm volatile ("fld.d $f25, %0" ::"m"(src_float[25]) :"$f25"); \ + asm volatile ("fld.d $f26, %0" ::"m"(src_float[26]) :"$f26"); \ + asm volatile ("fld.d $f27, %0" ::"m"(src_float[27]) :"$f27"); \ + asm volatile ("fld.d $f28, %0" ::"m"(src_float[28]) :"$f28"); \ + asm volatile ("fld.d $f29, %0" ::"m"(src_float[29]) :"$f29"); \ + asm volatile ("fld.d $f30, %0" ::"m"(src_float[30]) :"$f30"); \ + asm volatile ("fld.d $f31, %0" ::"m"(src_float[31]) :"$f31"); + +#define SAVE_REGISTER_FLOAT() \ + asm volatile ("fst.d $f0, %0" :"=m"(restore_float[0]) :); \ + asm volatile ("fst.d $f1, %0" :"=m"(restore_float[1]) :); \ + asm volatile ("fst.d $f2, %0" :"=m"(restore_float[2]) :); \ + asm volatile ("fst.d $f3, %0" :"=m"(restore_float[3]) :); \ + asm volatile ("fst.d $f4, %0" :"=m"(restore_float[4]) :); \ + asm volatile ("fst.d $f5, %0" :"=m"(restore_float[5]) :); \ + asm volatile ("fst.d $f6, %0" :"=m"(restore_float[6]) :); \ + asm volatile ("fst.d $f7, %0" :"=m"(restore_float[7]) :); \ + asm volatile ("fst.d $f8, %0" :"=m"(restore_float[8]) :); \ + asm volatile ("fst.d $f9, %0" :"=m"(restore_float[9]) :); \ + asm volatile ("fst.d $f10, %0" :"=m"(restore_float[10]) :); \ + asm volatile ("fst.d $f11, %0" :"=m"(restore_float[11]) :); \ + asm volatile ("fst.d $f12, %0" :"=m"(restore_float[12]) :); \ + asm volatile ("fst.d $f13, %0" :"=m"(restore_float[13]) :); \ + asm volatile ("fst.d $f14, %0" :"=m"(restore_float[14]) :); \ + asm volatile ("fst.d $f15, %0" :"=m"(restore_float[15]) :); \ + asm volatile ("fst.d $f16, %0" :"=m"(restore_float[16]) :); \ + asm volatile ("fst.d $f17, %0" :"=m"(restore_float[17]) :); \ + asm volatile ("fst.d $f18, %0" :"=m"(restore_float[18]) :); \ + asm volatile ("fst.d $f19, %0" :"=m"(restore_float[19]) :); \ + asm volatile ("fst.d $f20, %0" :"=m"(restore_float[20]) :); \ + asm 
volatile ("fst.d $f21, %0" :"=m"(restore_float[21]) :); \ + asm volatile ("fst.d $f22, %0" :"=m"(restore_float[22]) :); \ + asm volatile ("fst.d $f23, %0" :"=m"(restore_float[23]) :); \ + asm volatile ("fst.d $f24, %0" :"=m"(restore_float[24]) :); \ + asm volatile ("fst.d $f25, %0" :"=m"(restore_float[25]) :); \ + asm volatile ("fst.d $f26, %0" :"=m"(restore_float[26]) :); \ + asm volatile ("fst.d $f27, %0" :"=m"(restore_float[27]) :); \ + asm volatile ("fst.d $f28, %0" :"=m"(restore_float[28]) :); \ + asm volatile ("fst.d $f29, %0" :"=m"(restore_float[29]) :); \ + asm volatile ("fst.d $f30, %0" :"=m"(restore_float[30]) :); \ + asm volatile ("fst.d $f31, %0" :"=m"(restore_float[31]) :); + +#ifdef HAVE_LOONGARCH_VEC_COM + #define LOAD_REGISTER_LSX() \ + /* Every byte in $vr0 is 1. */ \ + asm volatile ("vldi $vr0, 1" ::: "$vr0"); \ + asm volatile ("vldi $vr1, 2" ::: "$vr1"); \ + asm volatile ("vldi $vr2, 3" ::: "$vr2"); \ + asm volatile ("vldi $vr3, 4" ::: "$vr3"); \ + asm volatile ("vldi $vr4, 5" ::: "$vr4"); \ + asm volatile ("vldi $vr5, 6" ::: "$vr5"); \ + asm volatile ("vldi $vr6, 7" ::: "$vr6"); \ + asm volatile ("vldi $vr7, 8" ::: "$vr7"); \ + asm volatile ("vldi $vr8, 9" ::: "$vr8"); \ + asm volatile ("vldi $vr9, 10" ::: "$vr9"); \ + asm volatile ("vldi $vr10, 11" ::: "$vr10"); \ + asm volatile ("vldi $vr11, 12" ::: "$vr11"); \ + asm volatile ("vldi $vr12, 13" ::: "$vr12"); \ + asm volatile ("vldi $vr13, 14" ::: "$vr13"); \ + asm volatile ("vldi $vr14, 15" ::: "$vr14"); \ + asm volatile ("vldi $vr15, 16" ::: "$vr15"); \ + asm volatile ("vldi $vr16, 17" ::: "$vr16"); \ + asm volatile ("vldi $vr17, 18" ::: "$vr17"); \ + asm volatile ("vldi $vr18, 19" ::: "$vr18"); \ + asm volatile ("vldi $vr19, 20" ::: "$vr19"); \ + asm volatile ("vldi $vr20, 21" ::: "$vr20"); \ + asm volatile ("vldi $vr21, 22" ::: "$vr21"); \ + asm volatile ("vldi $vr22, 23" ::: "$vr22"); \ + asm volatile ("vldi $vr23, 24" ::: "$vr23"); \ + asm volatile ("vldi $vr24, 25" ::: "$vr24"); \ + asm 
volatile ("vldi $vr25, 26" ::: "$vr25"); \ + asm volatile ("vldi $vr26, 27" ::: "$vr26"); \ + asm volatile ("vldi $vr27, 28" ::: "$vr27"); \ + asm volatile ("vldi $vr28, 29" ::: "$vr28"); \ + asm volatile ("vldi $vr29, 30" ::: "$vr29"); \ + asm volatile ("vldi $vr30, 31" ::: "$vr30"); \ + asm volatile ("vldi $vr31, 32" ::: "$vr31"); +#else + #define LOAD_REGISTER_LSX() +#endif + +#ifdef HAVE_LOONGARCH_VEC_COM + #define SAVE_REGISTER_LSX() \ + int src_lsx[32][4]; \ + int restore_lsx[32][4]; \ + asm volatile ("vst $vr0, %0" :"=m"(restore_lsx[0]) :); \ + asm volatile ("vst $vr1, %0" :"=m"(restore_lsx[1]) :); \ + asm volatile ("vst $vr2, %0" :"=m"(restore_lsx[2]) :); \ + asm volatile ("vst $vr3, %0" :"=m"(restore_lsx[3]) :); \ + asm volatile ("vst $vr4, %0" :"=m"(restore_lsx[4]) :); \ + asm volatile ("vst $vr5, %0" :"=m"(restore_lsx[5]) :); \ + asm volatile ("vst $vr6, %0" :"=m"(restore_lsx[6]) :); \ + asm volatile ("vst $vr7, %0" :"=m"(restore_lsx[7]) :); \ + asm volatile ("vst $vr8, %0" :"=m"(restore_lsx[8]) :); \ + asm volatile ("vst $vr9, %0" :"=m"(restore_lsx[9]) :); \ + asm volatile ("vst $vr10, %0" :"=m"(restore_lsx[10]) :); \ + asm volatile ("vst $vr11, %0" :"=m"(restore_lsx[11]) :); \ + asm volatile ("vst $vr12, %0" :"=m"(restore_lsx[12]) :); \ + asm volatile ("vst $vr13, %0" :"=m"(restore_lsx[13]) :); \ + asm volatile ("vst $vr14, %0" :"=m"(restore_lsx[14]) :); \ + asm volatile ("vst $vr15, %0" :"=m"(restore_lsx[15]) :); \ + asm volatile ("vst $vr16, %0" :"=m"(restore_lsx[16]) :); \ + asm volatile ("vst $vr17, %0" :"=m"(restore_lsx[17]) :); \ + asm volatile ("vst $vr18, %0" :"=m"(restore_lsx[18]) :); \ + asm volatile ("vst $vr19, %0" :"=m"(restore_lsx[19]) :); \ + asm volatile ("vst $vr20, %0" :"=m"(restore_lsx[20]) :); \ + asm volatile ("vst $vr21, %0" :"=m"(restore_lsx[21]) :); \ + asm volatile ("vst $vr22, %0" :"=m"(restore_lsx[22]) :); \ + asm volatile ("vst $vr23, %0" :"=m"(restore_lsx[23]) :); \ + asm volatile ("vst $vr24, %0" :"=m"(restore_lsx[24]) :); 
\ + asm volatile ("vst $vr25, %0" :"=m"(restore_lsx[25]) :); \ + asm volatile ("vst $vr26, %0" :"=m"(restore_lsx[26]) :); \ + asm volatile ("vst $vr27, %0" :"=m"(restore_lsx[27]) :); \ + asm volatile ("vst $vr28, %0" :"=m"(restore_lsx[28]) :); \ + asm volatile ("vst $vr29, %0" :"=m"(restore_lsx[29]) :); \ + asm volatile ("vst $vr30, %0" :"=m"(restore_lsx[30]) :); \ + asm volatile ("vst $vr31, %0" :"=m"(restore_lsx[31]) :); \ + for (int i = 0; i < 32; i++) \ + for (int j = 0; j < 4; j++) \ + { \ + src_lsx[i][j] = 0x01010101 * (i + 1); \ + if (src_lsx[i][j] != restore_lsx[i][j]) \ + abort (); \ + } +#else + #define SAVE_REGISTER_LSX() +#endif + +#ifdef HAVE_LOONGARCH_VEC_COM + #define LOAD_REGISTER_LASX() \ + /* Every byte in $xr0 is 1. */ \ + asm volatile ("xvldi $xr0, 1" ::: "$xr0"); \ + asm volatile ("xvldi $xr1, 2" ::: "$xr1"); \ + asm volatile ("xvldi $xr2, 3" ::: "$xr2"); \ + asm volatile ("xvldi $xr3, 4" ::: "$xr3"); \ + asm volatile ("xvldi $xr4, 5" ::: "$xr4"); \ + asm volatile ("xvldi $xr5, 6" ::: "$xr5"); \ + asm volatile ("xvldi $xr6, 7" ::: "$xr6"); \ + asm volatile ("xvldi $xr7, 8" ::: "$xr7"); \ + asm volatile ("xvldi $xr8, 9" ::: "$xr8"); \ + asm volatile ("xvldi $xr9, 10" ::: "$xr9"); \ + asm volatile ("xvldi $xr10, 11" ::: "$xr10"); \ + asm volatile ("xvldi $xr11, 12" ::: "$xr11"); \ + asm volatile ("xvldi $xr12, 13" ::: "$xr12"); \ + asm volatile ("xvldi $xr13, 14" ::: "$xr13"); \ + asm volatile ("xvldi $xr14, 15" ::: "$xr14"); \ + asm volatile ("xvldi $xr15, 16" ::: "$xr15"); \ + asm volatile ("xvldi $xr16, 17" ::: "$xr16"); \ + asm volatile ("xvldi $xr17, 18" ::: "$xr17"); \ + asm volatile ("xvldi $xr18, 19" ::: "$xr18"); \ + asm volatile ("xvldi $xr19, 20" ::: "$xr19"); \ + asm volatile ("xvldi $xr20, 21" ::: "$xr20"); \ + asm volatile ("xvldi $xr21, 22" ::: "$xr21"); \ + asm volatile ("xvldi $xr22, 23" ::: "$xr22"); \ + asm volatile ("xvldi $xr23, 24" ::: "$xr23"); \ + asm volatile ("xvldi $xr24, 25" ::: "$xr24"); \ + asm volatile ("xvldi 
$xr25, 26" ::: "$xr25"); \ + asm volatile ("xvldi $xr26, 27" ::: "$xr26"); \ + asm volatile ("xvldi $xr27, 28" ::: "$xr27"); \ + asm volatile ("xvldi $xr28, 29" ::: "$xr28"); \ + asm volatile ("xvldi $xr29, 30" ::: "$xr29"); \ + asm volatile ("xvldi $xr30, 31" ::: "$xr30"); \ + asm volatile ("xvldi $xr31, 32" ::: "$xr31"); +#else + #define LOAD_REGISTER_LASX() +#endif + +#ifdef HAVE_LOONGARCH_VEC_COM + #define SAVE_REGISTER_LASX() \ + int src_lasx[32][8]; \ + int restore_lasx[32][8]; \ + asm volatile ("xvst $xr0, %0" :"=m"(restore_lasx[0]) :); \ + asm volatile ("xvst $xr1, %0" :"=m"(restore_lasx[1]) :); \ + asm volatile ("xvst $xr2, %0" :"=m"(restore_lasx[2]) :); \ + asm volatile ("xvst $xr3, %0" :"=m"(restore_lasx[3]) :); \ + asm volatile ("xvst $xr4, %0" :"=m"(restore_lasx[4]) :); \ + asm volatile ("xvst $xr5, %0" :"=m"(restore_lasx[5]) :); \ + asm volatile ("xvst $xr6, %0" :"=m"(restore_lasx[6]) :); \ + asm volatile ("xvst $xr7, %0" :"=m"(restore_lasx[7]) :); \ + asm volatile ("xvst $xr8, %0" :"=m"(restore_lasx[8]) :); \ + asm volatile ("xvst $xr9, %0" :"=m"(restore_lasx[9]) :); \ + asm volatile ("xvst $xr10, %0" :"=m"(restore_lasx[10]) :); \ + asm volatile ("xvst $xr11, %0" :"=m"(restore_lasx[11]) :); \ + asm volatile ("xvst $xr12, %0" :"=m"(restore_lasx[12]) :); \ + asm volatile ("xvst $xr13, %0" :"=m"(restore_lasx[13]) :); \ + asm volatile ("xvst $xr14, %0" :"=m"(restore_lasx[14]) :); \ + asm volatile ("xvst $xr15, %0" :"=m"(restore_lasx[15]) :); \ + asm volatile ("xvst $xr16, %0" :"=m"(restore_lasx[16]) :); \ + asm volatile ("xvst $xr17, %0" :"=m"(restore_lasx[17]) :); \ + asm volatile ("xvst $xr18, %0" :"=m"(restore_lasx[18]) :); \ + asm volatile ("xvst $xr19, %0" :"=m"(restore_lasx[19]) :); \ + asm volatile ("xvst $xr20, %0" :"=m"(restore_lasx[20]) :); \ + asm volatile ("xvst $xr21, %0" :"=m"(restore_lasx[21]) :); \ + asm volatile ("xvst $xr22, %0" :"=m"(restore_lasx[22]) :); \ + asm volatile ("xvst $xr23, %0" :"=m"(restore_lasx[23]) :); \ + asm volatile 
("xvst $xr24, %0" :"=m"(restore_lasx[24]) :); \ + asm volatile ("xvst $xr25, %0" :"=m"(restore_lasx[25]) :); \ + asm volatile ("xvst $xr26, %0" :"=m"(restore_lasx[26]) :); \ + asm volatile ("xvst $xr27, %0" :"=m"(restore_lasx[27]) :); \ + asm volatile ("xvst $xr28, %0" :"=m"(restore_lasx[28]) :); \ + asm volatile ("xvst $xr29, %0" :"=m"(restore_lasx[29]) :); \ + asm volatile ("xvst $xr30, %0" :"=m"(restore_lasx[30]) :); \ + asm volatile ("xvst $xr31, %0" :"=m"(restore_lasx[31]) :); \ + for (int i = 0; i < 32; i++) \ + for (int j = 0; j < 8; j++) \ + { \ + src_lasx[i][j] = 0x01010101 * (i + 1); \ + if (src_lasx[i][j] != restore_lasx[i][j]) \ + abort (); \ + } +#else + #define SAVE_REGISTER_LASX() +#endif + +#define BEFORE_TLSDESC_CALL() \ + uint64_t src; \ + double src_float[32]; \ + uint64_t src_fcc[8]; \ + for (int i = 0; i < 32; i++) \ + src_float[i] = i + 1; \ + \ + SAVE_REGISTER (src); \ + LOAD_REGISTER_FCSR (); \ + SAVE_REGISTER_FCC(src_fcc) \ + \ + if (hwcap & HWCAP_LOONGARCH_LASX) \ + { \ + LOAD_REGISTER_LASX (); \ + } \ + else if (hwcap & HWCAP_LOONGARCH_LSX) \ + { \ + LOAD_REGISTER_LSX (); \ + } \ + else \ + { \ + LOAD_REGISTER_FLOAT (); \ + } + +#define AFTER_TLSDESC_CALL() \ + uint64_t restore; \ + uint64_t src_fcsr = 0x01010101; \ + uint64_t restore_fcsr; \ + uint64_t restore_fcc[8]; \ + SAVE_REGISTER (restore); \ + SAVE_REGISTER_FCSR (); \ + SAVE_REGISTER_FCC(restore_fcc) \ + \ + /* memcmp_lasx/strlen_lasx corrupts LSX/LASX registers, */ \ + /* compare LSX/LASX registers first. 
*/ \ + if (hwcap & HWCAP_LOONGARCH_LASX) \ + { \ + SAVE_REGISTER_LASX (); \ + } \ + else if (hwcap & HWCAP_LOONGARCH_LSX) \ + { \ + SAVE_REGISTER_LSX (); \ + } \ + else \ + { \ + double restore_float[32]; \ + SAVE_REGISTER_FLOAT (); \ + \ + for (int i = 0; i < 32; i++) \ + if (src_float[i] != restore_float[i]) \ + abort (); \ + } \ + \ + if (src_fcsr != restore_fcsr) \ + abort (); \ + \ + if (memcmp (src_fcc, restore_fcc, sizeof (src_fcc)) != 0) \ + abort (); \ + \ + if (src != restore) \ + abort (); + +#endif /* #ifdef __loongarch_soft_float */ + +#include_next <tst-gnu2-tls2.h> diff --git a/sysdeps/m68k/m680x0/fpu/w_exp10_compat.c b/sysdeps/m68k/m680x0/fpu/w_exp10_compat.c index 0d3e718626..350f2e4b4d 100644 --- a/sysdeps/m68k/m680x0/fpu/w_exp10_compat.c +++ b/sysdeps/m68k/m680x0/fpu/w_exp10_compat.c @@ -1,3 +1,8 @@ /* m68k provides an optimized __ieee754_exp10. */ -#define NO_COMPAT_NEEDED 1 -#include <math/w_exp10_compat.c> +#ifdef SHARED +# define NO_COMPAT_NEEDED 1 +# include <math/w_exp10_compat.c> +#else +# include <math-type-macros-double.h> +# include <w_exp10_template.c> +#endif diff --git a/sysdeps/m68k/m680x0/fpu/w_fmod_compat.c b/sysdeps/m68k/m680x0/fpu/w_fmod_compat.c index 527d4fbed2..57f38091e6 100644 --- a/sysdeps/m68k/m680x0/fpu/w_fmod_compat.c +++ b/sysdeps/m68k/m680x0/fpu/w_fmod_compat.c @@ -7,8 +7,9 @@ # define LIBM_SVID_COMPAT 1 # undef compat_symbol # define compat_symbol(a, b, c, d) -#endif #include <math/w_fmod_compat.c> -#ifdef SHARED libm_alias_double (__fmod_compat, fmod) +#else +#include <math-type-macros-double.h> +#include <w_fmod_template.c> #endif diff --git a/sysdeps/m68k/m680x0/fpu/w_fmodf_compat.c b/sysdeps/m68k/m680x0/fpu/w_fmodf_compat.c index 5043586b91..88db07f443 100644 --- a/sysdeps/m68k/m680x0/fpu/w_fmodf_compat.c +++ b/sysdeps/m68k/m680x0/fpu/w_fmodf_compat.c @@ -7,8 +7,9 @@ # define LIBM_SVID_COMPAT 1 # undef compat_symbol # define compat_symbol(a, b, c, d) -#endif -#include <math/w_fmodf_compat.c> -#ifdef SHARED +# 
include <math/w_fmodf_compat.c> libm_alias_float (__fmod_compat, fmod) +#else +#include <math-type-macros-float.h> +#include <w_fmod_template.c> #endif diff --git a/sysdeps/mach/hurd/bits/socket.h b/sysdeps/mach/hurd/bits/socket.h index 3e72f9fa93..b5eeac3731 100644 --- a/sysdeps/mach/hurd/bits/socket.h +++ b/sysdeps/mach/hurd/bits/socket.h @@ -153,7 +153,7 @@ enum __socket_type #include <bits/sockaddr.h> /* Structure describing a generic socket address. */ -struct sockaddr +struct __attribute_struct_may_alias__ sockaddr { __SOCKADDR_COMMON (sa_); /* Common data: address family and length. */ char sa_data[14]; /* Address data. */ @@ -170,7 +170,7 @@ struct sockaddr #define _SS_PADSIZE \ (_SS_SIZE - __SOCKADDR_COMMON_SIZE - sizeof (__ss_aligntype)) -struct sockaddr_storage +struct __attribute_struct_may_alias__ sockaddr_storage { __SOCKADDR_COMMON (ss_); /* Address family, etc. */ char __ss_padding[_SS_PADSIZE]; diff --git a/sysdeps/mach/hurd/i386/libm.abilist b/sysdeps/mach/hurd/i386/libm.abilist index 8f40ddb150..113daa4b44 100644 --- a/sysdeps/mach/hurd/i386/libm.abilist +++ b/sysdeps/mach/hurd/i386/libm.abilist @@ -1181,3 +1181,11 @@ GLIBC_2.35 fsqrt F GLIBC_2.35 fsqrtl F GLIBC_2.35 hypot F GLIBC_2.35 hypotf F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/mach/hurd/lsetxattr.c b/sysdeps/mach/hurd/lsetxattr.c index 3f5d335a1a..49487f78fb 100644 --- a/sysdeps/mach/hurd/lsetxattr.c +++ b/sysdeps/mach/hurd/lsetxattr.c @@ -32,5 +32,5 @@ lsetxattr (const char *path, const char *name, const void *value, size_t size, return -1; err = _hurd_xattr_set (port, name, value, size, flags); __mach_port_deallocate (__mach_task_self (), port); - return err ? 
__hurd_fail (err) : size; + return __hurd_fail (err); } diff --git a/sysdeps/mach/hurd/x86_64/libm.abilist b/sysdeps/mach/hurd/x86_64/libm.abilist index 5c762b937b..0c7ffd4e3f 100644 --- a/sysdeps/mach/hurd/x86_64/libm.abilist +++ b/sysdeps/mach/hurd/x86_64/libm.abilist @@ -1038,3 +1038,11 @@ GLIBC_2.38 ynf32x F GLIBC_2.38 ynf64 F GLIBC_2.38 ynf64x F GLIBC_2.38 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/or1k/fpu/fclrexcpt.c b/sysdeps/or1k/fpu/fclrexcpt.c new file mode 100644 index 0000000000..44224f9c24 --- /dev/null +++ b/sysdeps/or1k/fpu/fclrexcpt.c @@ -0,0 +1,44 @@ +/* Clear given exceptions in current floating-point environment. + OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> + +int +feclearexcept (int excepts) +{ + fpu_control_t cw; + fpu_control_t cw_new; + + /* Mask out unsupported bits/exceptions. */ + excepts &= FE_ALL_EXCEPT; + + /* Read the complete control word. */ + _FPU_GETCW (cw); + + cw_new = cw & ~excepts; + + /* Put the new data in effect. */ + if (cw != cw_new) + _FPU_SETCW (cw_new); + + /* Success. 
*/ + return 0; +} +libm_hidden_def (feclearexcept) diff --git a/sysdeps/or1k/fpu/fegetenv.c b/sysdeps/or1k/fpu/fegetenv.c new file mode 100644 index 0000000000..70c75aa0bf --- /dev/null +++ b/sysdeps/or1k/fpu/fegetenv.c @@ -0,0 +1,32 @@ +/* Store current floating-point environment. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> + +int +__fegetenv (fenv_t *envp) +{ + _FPU_GETCW (*envp); + + /* Success. */ + return 0; +} +libm_hidden_def (__fegetenv) +weak_alias (__fegetenv, fegetenv) +libm_hidden_weak (fegetenv) diff --git a/sysdeps/or1k/fpu/fegetmode.c b/sysdeps/or1k/fpu/fegetmode.c new file mode 100644 index 0000000000..7fffd2e0b5 --- /dev/null +++ b/sysdeps/or1k/fpu/fegetmode.c @@ -0,0 +1,29 @@ +/* Store current floating-point control modes. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> + +int +fegetmode (femode_t *modep) +{ + _FPU_GETCW (*modep); + + /* Success. */ + return 0; +} diff --git a/sysdeps/or1k/fpu/fegetround.c b/sysdeps/or1k/fpu/fegetround.c new file mode 100644 index 0000000000..7e993b980a --- /dev/null +++ b/sysdeps/or1k/fpu/fegetround.c @@ -0,0 +1,29 @@ +/* Return current rounding direction. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <fenv.h> +#include <get-rounding-mode.h> + +int +__fegetround (void) +{ + return get_rounding_mode (); +} +libm_hidden_def (__fegetround) +weak_alias (__fegetround, fegetround) +libm_hidden_weak (fegetround) diff --git a/sysdeps/or1k/fpu/feholdexcpt.c b/sysdeps/or1k/fpu/feholdexcpt.c new file mode 100644 index 0000000000..0036e41ba2 --- /dev/null +++ b/sysdeps/or1k/fpu/feholdexcpt.c @@ -0,0 +1,33 @@ +/* Store current floating-point environment and clear exceptions. + OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fenv_private.h> + +int +__feholdexcept (fenv_t *envp) +{ + libc_feholdexcept_or1k (envp); + + /* Success. */ + return 0; +} +libm_hidden_def (__feholdexcept) +weak_alias (__feholdexcept, feholdexcept) +libm_hidden_weak (feholdexcept) diff --git a/sysdeps/or1k/fpu/fenv_private.h b/sysdeps/or1k/fpu/fenv_private.h new file mode 100644 index 0000000000..4f401e7a5a --- /dev/null +++ b/sysdeps/or1k/fpu/fenv_private.h @@ -0,0 +1,199 @@ +/* Private floating point rounding and exceptions handling. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef OR1K_FENV_PRIVATE_H +#define OR1K_FENV_PRIVATE_H 1 + +#include <fenv.h> +#include <fpu_control.h> + +static __always_inline void +libc_feholdexcept_or1k (fenv_t *envp) +{ + fpu_control_t cw; + fpu_control_t cw_new; + + /* Get and store the environment. */ + _FPU_GETCW (cw); + *envp = cw; + + /* Clear the exception status flags. */ + cw_new = cw & ~FE_ALL_EXCEPT; + + if (cw != cw_new) + _FPU_SETCW (cw_new); +} + +#define libc_feholdexcept libc_feholdexcept_or1k +#define libc_feholdexceptf libc_feholdexcept_or1k +#define libc_feholdexceptl libc_feholdexcept_or1k + +static __always_inline void +libc_fesetround_or1k (int round) +{ + fpu_control_t cw; + fpu_control_t cw_new; + + _FPU_GETCW (cw); + cw_new = cw & ~_FPU_FPCSR_RM_MASK; + cw_new |= round; + if (cw != cw_new) + _FPU_SETCW (cw_new); +} + +#define libc_fesetround libc_fesetround_or1k +#define libc_fesetroundf libc_fesetround_or1k +#define libc_fesetroundl libc_fesetround_or1k + +static __always_inline void +libc_feholdexcept_setround_or1k (fenv_t *envp, int round) +{ + fpu_control_t cw; + fpu_control_t cw_new; + + /* Get and store the environment. */ + _FPU_GETCW (cw); + *envp = cw; + + /* Clear the status flags and rounding mode. */ + cw_new = cw & ~(FE_ALL_EXCEPT | _FPU_FPCSR_RM_MASK); + + /* Set rounding mode. 
*/ + cw_new |= round; + + if (cw != cw_new) + _FPU_SETCW (cw_new); +} + +#define libc_feholdexcept_setround libc_feholdexcept_setround_or1k +#define libc_feholdexcept_setroundf libc_feholdexcept_setround_or1k +#define libc_feholdexcept_setroundl libc_feholdexcept_setround_or1k + +static __always_inline int +libc_fetestexcept_or1k (int ex) +{ + fpu_control_t cw; + + /* Get current control word. */ + _FPU_GETCW (cw); + + /* Check if any of the queried exception flags are set. */ + return cw & ex & FE_ALL_EXCEPT; +} + +#define libc_fetestexcept libc_fetestexcept_or1k +#define libc_fetestexceptf libc_fetestexcept_or1k +#define libc_fetestexceptl libc_fetestexcept_or1k + +static __always_inline void +libc_fesetenv_or1k (const fenv_t *envp) +{ + if (envp == FE_DFL_ENV) + _FPU_SETCW (_FPU_DEFAULT); + else + _FPU_SETCW (*envp); +} + +#define libc_fesetenv libc_fesetenv_or1k +#define libc_fesetenvf libc_fesetenv_or1k +#define libc_fesetenvl libc_fesetenv_or1k +#define libc_feresetround_noex libc_fesetenv_or1k +#define libc_feresetround_noexf libc_fesetenv_or1k +#define libc_feresetround_noexl libc_fesetenv_or1k + +static __always_inline int +libc_feupdateenv_test_or1k (const fenv_t *envp, int ex) +{ + fpu_control_t cw; + fpu_control_t cw_new; + int excepts; + + /* Get current control word. */ + _FPU_GETCW (cw); + + /* Merge current exception flags with the passed fenv. */ + excepts = cw & FE_ALL_EXCEPT; + cw_new = (envp == FE_DFL_ENV ? _FPU_DEFAULT : *envp) | excepts; + + if (__glibc_unlikely (cw != cw_new)) + _FPU_SETCW (cw_new); + + /* Raise the exceptions if enabled in the new FP state. 
*/ + if (__glibc_unlikely (excepts)) + __feraiseexcept (excepts); + + return excepts & ex; +} + +#define libc_feupdateenv_test libc_feupdateenv_test_or1k +#define libc_feupdateenv_testf libc_feupdateenv_test_or1k +#define libc_feupdateenv_testl libc_feupdateenv_test_or1k + +static __always_inline void +libc_feupdateenv_or1k (const fenv_t *envp) +{ + libc_feupdateenv_test_or1k (envp, 0); +} + +#define libc_feupdateenv libc_feupdateenv_or1k +#define libc_feupdateenvf libc_feupdateenv_or1k +#define libc_feupdateenvl libc_feupdateenv_or1k + +static __always_inline void +libc_feholdsetround_or1k (fenv_t *envp, int round) +{ + fpu_control_t cw; + + _FPU_GETCW (cw); + *envp = cw; + + /* Check whether rounding modes are different. */ + round = (cw ^ round) & _FPU_FPCSR_RM_MASK; + + /* Set new rounding mode if different. */ + if (__glibc_unlikely (round != 0)) + _FPU_SETCW (cw ^ round); +} + +#define libc_feholdsetround libc_feholdsetround_or1k +#define libc_feholdsetroundf libc_feholdsetround_or1k +#define libc_feholdsetroundl libc_feholdsetround_or1k + +static __always_inline void +libc_feresetround_or1k (fenv_t *envp) +{ + fpu_control_t cw; + int round; + + _FPU_GETCW (cw); + + /* Check whether rounding modes are different. */ + round = (*envp ^ cw) & _FPU_FPCSR_RM_MASK; + + /* Restore the rounding mode if it was changed. */ + if (__glibc_unlikely (round != 0)) + _FPU_SETCW (cw ^ round); +} + +#define libc_feresetround libc_feresetround_or1k +#define libc_feresetroundf libc_feresetround_or1k +#define libc_feresetroundl libc_feresetround_or1k + +#include_next <fenv_private.h> + +#endif diff --git a/sysdeps/or1k/fpu/fesetenv.c b/sysdeps/or1k/fpu/fesetenv.c new file mode 100644 index 0000000000..742ca719e0 --- /dev/null +++ b/sysdeps/or1k/fpu/fesetenv.c @@ -0,0 +1,32 @@ +/* Install given floating-point environment. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fenv_private.h> + +int +__fesetenv (const fenv_t *envp) +{ + libc_fesetenv_or1k (envp); + + /* Success. */ + return 0; +} +libm_hidden_def (__fesetenv) +weak_alias (__fesetenv, fesetenv) +libm_hidden_weak (fesetenv) diff --git a/sysdeps/or1k/fpu/fesetexcept.c b/sysdeps/or1k/fpu/fesetexcept.c new file mode 100644 index 0000000000..43734eac18 --- /dev/null +++ b/sysdeps/or1k/fpu/fesetexcept.c @@ -0,0 +1,35 @@ +/* Set given exception flags. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <fenv.h> +#include <fpu_control.h> + +int +fesetexcept (int excepts) +{ + fpu_control_t cw; + fpu_control_t cw_new; + + _FPU_GETCW (cw); + cw_new = cw | (excepts & FE_ALL_EXCEPT); + if (cw != cw_new) + _FPU_SETCW (cw_new); + + /* Success. */ + return 0; +} diff --git a/sysdeps/or1k/fpu/fesetmode.c b/sysdeps/or1k/fpu/fesetmode.c new file mode 100644 index 0000000000..d4556927ce --- /dev/null +++ b/sysdeps/or1k/fpu/fesetmode.c @@ -0,0 +1,39 @@ +/* Install given floating-point control modes. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> + +int +fesetmode (const femode_t *modep) +{ + fpu_control_t cw; + fpu_control_t cw_new; + + _FPU_GETCW (cw); + cw_new = cw & ~_FPU_FPCSR_RM_MASK; + if (modep == FE_DFL_MODE) + cw_new |= (_FPU_DEFAULT & _FPU_FPCSR_RM_MASK); + else + cw_new |= (*modep & _FPU_FPCSR_RM_MASK); + if (cw != cw_new) + _FPU_SETCW (cw_new); + + /* Success. */ + return 0; +} diff --git a/sysdeps/or1k/fpu/fesetround.c b/sysdeps/or1k/fpu/fesetround.c new file mode 100644 index 0000000000..c2ada98f1b --- /dev/null +++ b/sysdeps/or1k/fpu/fesetround.c @@ -0,0 +1,39 @@ +/* Set current rounding direction. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fenv_private.h> + +int +__fesetround (int round) +{ + switch (round) + { + case FE_TONEAREST: + case FE_TOWARDZERO: + case FE_DOWNWARD: + case FE_UPWARD: + libc_fesetround_or1k (round); + return 0; + default: + return round; /* A nonzero value. */ + } +} +libm_hidden_def (__fesetround) +weak_alias (__fesetround, fesetround) +libm_hidden_weak (fesetround) diff --git a/sysdeps/or1k/fpu/feupdateenv.c b/sysdeps/or1k/fpu/feupdateenv.c new file mode 100644 index 0000000000..3355bf6596 --- /dev/null +++ b/sysdeps/or1k/fpu/feupdateenv.c @@ -0,0 +1,33 @@ +/* Install given floating-point environment and raise exceptions. + OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fenv_private.h> + +int +__feupdateenv (const fenv_t *envp) +{ + libc_feupdateenv_or1k (envp); + + /* Success. */ + return 0; +} +libm_hidden_def (__feupdateenv) +weak_alias (__feupdateenv, feupdateenv) +libm_hidden_weak (feupdateenv) diff --git a/sysdeps/or1k/fpu/fgetexcptflg.c b/sysdeps/or1k/fpu/fgetexcptflg.c new file mode 100644 index 0000000000..a954f6a2f1 --- /dev/null +++ b/sysdeps/or1k/fpu/fgetexcptflg.c @@ -0,0 +1,29 @@ +/* Store current state of exceptions. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fenv_private.h> + +int +fegetexceptflag (fexcept_t *flagp, int excepts) +{ + *flagp = libc_fetestexcept_or1k (excepts); + + /* Success. 
*/ + return 0; +} diff --git a/sysdeps/or1k/fpu/fix-fp-int-convert-overflow.h b/sysdeps/or1k/fpu/fix-fp-int-convert-overflow.h new file mode 100644 index 0000000000..78104289c0 --- /dev/null +++ b/sysdeps/or1k/fpu/fix-fp-int-convert-overflow.h @@ -0,0 +1,38 @@ +/* Fix for conversion of floating point to integer overflow. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef FIX_FP_INT_CONVERT_OVERFLOW_H +#define FIX_FP_INT_CONVERT_OVERFLOW_H 1 + +/* The generic libgcc2.c conversions from floating point to long long + may not raise the correct exceptions on overflow (and may raise + spurious "inexact" exceptions even in non-overflow cases, see + <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59412>). 
*/ +#define FIX_FLT_LONG_CONVERT_OVERFLOW 0 +#define FIX_FLT_LLONG_CONVERT_OVERFLOW 1 + +#define FIX_DBL_LONG_CONVERT_OVERFLOW 0 +#define FIX_DBL_LLONG_CONVERT_OVERFLOW 1 + +#define FIX_LDBL_LONG_CONVERT_OVERFLOW 0 +#define FIX_LDBL_LLONG_CONVERT_OVERFLOW 0 + +#define FIX_FLT128_LONG_CONVERT_OVERFLOW 0 +#define FIX_FLT128_LLONG_CONVERT_OVERFLOW 0 + +#endif /* fix-fp-int-convert-overflow.h */ diff --git a/sysdeps/or1k/fpu/fraiseexcpt.c b/sysdeps/or1k/fpu/fraiseexcpt.c new file mode 100644 index 0000000000..bbacfd50bc --- /dev/null +++ b/sysdeps/or1k/fpu/fraiseexcpt.c @@ -0,0 +1,67 @@ +/* Raise given exceptions. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> +#include <float.h> +#include <math.h> + +int +__feraiseexcept (int excepts) +{ + if (excepts == 0) + return 0; + + /* Raise exceptions represented by EXPECTS. 
*/ + + if (excepts & FE_INEXACT) + { + float d = 1.0, x = 3.0; + __asm__ volatile ("lf.div.s %0, %0, %1" : "+r" (d) : "r" (x)); + } + + if (excepts & FE_UNDERFLOW) + { + float d = FLT_MIN; + __asm__ volatile ("lf.mul.s %0, %0, %0" : "+r" (d)); + } + + if (excepts & FE_OVERFLOW) + { + float d = FLT_MAX; + __asm__ volatile ("lf.mul.s %0, %0, %0" : "+r" (d) : "r" (d)); + } + + if (excepts & FE_DIVBYZERO) + { + float d = 1.0, x = 0.0; + __asm__ volatile ("lf.div.s %0, %0, %1" : "+r" (d) : "r" (x)); + } + + if (excepts & FE_INVALID) + { + float d = HUGE_VAL, x = 0.0; + __asm__ volatile ("lf.mul.s %0, %1, %0" : "+r" (d) : "r" (x)); + } + + /* Success. */ + return 0; +} +libm_hidden_def (__feraiseexcept) +weak_alias (__feraiseexcept, feraiseexcept) +libm_hidden_weak (feraiseexcept) diff --git a/sysdeps/or1k/fpu/fsetexcptflg.c b/sysdeps/or1k/fpu/fsetexcptflg.c new file mode 100644 index 0000000000..c327e4c5d1 --- /dev/null +++ b/sysdeps/or1k/fpu/fsetexcptflg.c @@ -0,0 +1,43 @@ +/* Set floating-point environment exception handling. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <fenv.h> +#include <fpu_control.h> + +int +fesetexceptflag (const fexcept_t *flagp, int excepts) +{ + fpu_control_t cw; + fpu_control_t cw_new; + + /* Get the current exceptions. */ + _FPU_GETCW (cw); + + /* Make sure the flags we want restored are legal. */ + excepts &= FE_ALL_EXCEPT; + + /* Now set selected bits from flagp. Note that we ignore all non-flag + bits from *flagp, so they don't matter. */ + cw_new = (cw & ~excepts) | (*flagp & excepts); + + if (cw != cw_new) + _FPU_SETCW (cw_new); + + /* Success. */ + return 0; +} diff --git a/sysdeps/or1k/fpu/ftestexcept.c b/sysdeps/or1k/fpu/ftestexcept.c new file mode 100644 index 0000000000..59f06afa22 --- /dev/null +++ b/sysdeps/or1k/fpu/ftestexcept.c @@ -0,0 +1,27 @@ +/* Test exception in current environment. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <fenv.h> +#include <fenv_private.h> + +int +fetestexcept (int excepts) +{ + return libc_fetestexcept_or1k (excepts); +} +libm_hidden_def (fetestexcept) diff --git a/sysdeps/or1k/fpu/get-rounding-mode.h b/sysdeps/or1k/fpu/get-rounding-mode.h new file mode 100644 index 0000000000..a66d553be8 --- /dev/null +++ b/sysdeps/or1k/fpu/get-rounding-mode.h @@ -0,0 +1,38 @@ +/* Determine floating-point rounding mode within libc. OpenRISC version. + + Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _OR1K_GET_ROUNDING_MODE_H +#define _OR1K_GET_ROUNDING_MODE_H 1 + +#include <fenv.h> +#include <fpu_control.h> + +/* Return the floating-point rounding mode. 
*/ + +static inline int +get_rounding_mode (void) +{ + fpu_control_t cw; + + _FPU_GETCW (cw); + return cw & _FPU_FPCSR_RM_MASK; +} + +#endif /* get-rounding-mode.h */ diff --git a/sysdeps/or1k/fpu/libm-test-ulps b/sysdeps/or1k/fpu/libm-test-ulps new file mode 100644 index 0000000000..948ec01cdc --- /dev/null +++ b/sysdeps/or1k/fpu/libm-test-ulps @@ -0,0 +1,1115 @@ +# Begin of automatic generation + +# Maximal error of functions: +Function: "acos": +double: 1 +float: 1 + +Function: "acos_downward": +double: 1 +float: 1 + +Function: "acos_towardzero": +double: 1 +float: 1 + +Function: "acos_upward": +double: 1 +float: 1 + +Function: "acosh": +double: 2 +float: 2 + +Function: "acosh_downward": +double: 2 +float: 2 + +Function: "acosh_towardzero": +double: 2 +float: 2 + +Function: "acosh_upward": +double: 2 +float: 2 + +Function: "asin": +double: 1 +float: 1 + +Function: "asin_downward": +double: 1 +float: 1 + +Function: "asin_towardzero": +double: 1 +float: 1 + +Function: "asin_upward": +double: 2 +float: 1 + +Function: "asinh": +double: 2 +float: 2 + +Function: "asinh_downward": +double: 3 +float: 3 + +Function: "asinh_towardzero": +double: 2 +float: 2 + +Function: "asinh_upward": +double: 3 +float: 3 + +Function: "atan": +double: 1 +float: 1 + +Function: "atan2": +float: 2 + +Function: "atan2_downward": +double: 5 +float: 2 + +Function: "atan2_towardzero": +double: 5 +float: 2 + +Function: "atan2_upward": +double: 8 +float: 2 + +Function: "atan_downward": +double: 1 +float: 2 + +Function: "atan_towardzero": +double: 1 +float: 1 + +Function: "atan_upward": +double: 1 +float: 2 + +Function: "atanh": +double: 2 +float: 2 + +Function: "atanh_downward": +double: 3 +float: 3 + +Function: "atanh_towardzero": +double: 2 +float: 2 + +Function: "atanh_upward": +double: 3 +float: 3 + +Function: "cabs": +double: 1 + +Function: "cabs_downward": +double: 1 + +Function: "cabs_towardzero": +double: 1 + +Function: "cabs_upward": +double: 1 + +Function: Real part of "cacos": +double: 
1 +float: 2 + +Function: Imaginary part of "cacos": +double: 2 +float: 2 + +Function: Real part of "cacos_downward": +double: 3 +float: 2 + +Function: Imaginary part of "cacos_downward": +double: 5 +float: 3 + +Function: Real part of "cacos_towardzero": +double: 3 +float: 2 + +Function: Imaginary part of "cacos_towardzero": +double: 4 +float: 2 + +Function: Real part of "cacos_upward": +double: 2 +float: 2 + +Function: Imaginary part of "cacos_upward": +double: 5 +float: 7 + +Function: Real part of "cacosh": +double: 2 +float: 2 + +Function: Imaginary part of "cacosh": +double: 1 +float: 2 + +Function: Real part of "cacosh_downward": +double: 4 +float: 2 + +Function: Imaginary part of "cacosh_downward": +double: 3 +float: 3 + +Function: Real part of "cacosh_towardzero": +double: 4 +float: 2 + +Function: Imaginary part of "cacosh_towardzero": +double: 3 +float: 2 + +Function: Real part of "cacosh_upward": +double: 4 +float: 3 + +Function: Imaginary part of "cacosh_upward": +double: 3 +float: 2 + +Function: "carg": +float: 1 + +Function: "carg_downward": +double: 5 +float: 2 + +Function: "carg_towardzero": +double: 5 +float: 2 + +Function: "carg_upward": +double: 8 +float: 2 + +Function: Real part of "casin": +double: 1 +float: 1 + +Function: Imaginary part of "casin": +double: 2 +float: 2 + +Function: Real part of "casin_downward": +double: 3 +float: 2 + +Function: Imaginary part of "casin_downward": +double: 5 +float: 3 + +Function: Real part of "casin_towardzero": +double: 3 +float: 1 + +Function: Imaginary part of "casin_towardzero": +double: 4 +float: 2 + +Function: Real part of "casin_upward": +double: 3 +float: 2 + +Function: Imaginary part of "casin_upward": +double: 5 +float: 7 + +Function: Real part of "casinh": +double: 2 +float: 2 + +Function: Imaginary part of "casinh": +double: 1 +float: 1 + +Function: Real part of "casinh_downward": +double: 5 +float: 3 + +Function: Imaginary part of "casinh_downward": +double: 3 +float: 2 + +Function: Real part of 
"casinh_towardzero": +double: 4 +float: 2 + +Function: Imaginary part of "casinh_towardzero": +double: 3 +float: 1 + +Function: Real part of "casinh_upward": +double: 5 +float: 7 + +Function: Imaginary part of "casinh_upward": +double: 3 +float: 2 + +Function: Real part of "catan": +double: 1 +float: 1 + +Function: Imaginary part of "catan": +double: 1 +float: 1 + +Function: Real part of "catan_downward": +double: 1 +float: 2 + +Function: Imaginary part of "catan_downward": +double: 2 +float: 2 + +Function: Real part of "catan_towardzero": +double: 1 +float: 2 + +Function: Imaginary part of "catan_towardzero": +double: 2 +float: 2 + +Function: Real part of "catan_upward": +double: 2 +float: 1 + +Function: Imaginary part of "catan_upward": +double: 2 +float: 2 + +Function: Real part of "catanh": +double: 1 +float: 1 + +Function: Imaginary part of "catanh": +double: 1 +float: 1 + +Function: Real part of "catanh_downward": +double: 2 +float: 2 + +Function: Imaginary part of "catanh_downward": +double: 1 +float: 2 + +Function: Real part of "catanh_towardzero": +double: 2 +float: 2 + +Function: Imaginary part of "catanh_towardzero": +double: 1 +float: 2 + +Function: Real part of "catanh_upward": +double: 4 +float: 4 + +Function: Imaginary part of "catanh_upward": +double: 2 +float: 1 + +Function: "cbrt": +double: 4 +float: 1 + +Function: "cbrt_downward": +double: 4 +float: 1 + +Function: "cbrt_towardzero": +double: 3 +float: 1 + +Function: "cbrt_upward": +double: 5 +float: 1 + +Function: Real part of "ccos": +double: 1 +float: 1 + +Function: Imaginary part of "ccos": +double: 1 +float: 1 + +Function: Real part of "ccos_downward": +double: 3 +float: 1 + +Function: Imaginary part of "ccos_downward": +double: 3 +float: 3 + +Function: Real part of "ccos_towardzero": +double: 3 +float: 2 + +Function: Imaginary part of "ccos_towardzero": +double: 3 +float: 3 + +Function: Real part of "ccos_upward": +double: 1 +float: 2 + +Function: Imaginary part of "ccos_upward": +double: 2 
+float: 2 + +Function: Real part of "ccosh": +double: 2 +float: 1 + +Function: Imaginary part of "ccosh": +double: 1 +float: 1 + +Function: Real part of "ccosh_downward": +double: 3 +float: 2 + +Function: Imaginary part of "ccosh_downward": +double: 3 +float: 3 + +Function: Real part of "ccosh_towardzero": +double: 3 +float: 3 + +Function: Imaginary part of "ccosh_towardzero": +double: 3 +float: 3 + +Function: Real part of "ccosh_upward": +double: 1 +float: 2 + +Function: Imaginary part of "ccosh_upward": +double: 2 +float: 2 + +Function: Real part of "cexp": +double: 2 +float: 1 + +Function: Imaginary part of "cexp": +double: 1 +float: 2 + +Function: Real part of "cexp_downward": +double: 4 +float: 2 + +Function: Imaginary part of "cexp_downward": +double: 3 +float: 3 + +Function: Real part of "cexp_towardzero": +double: 4 +float: 2 + +Function: Imaginary part of "cexp_towardzero": +double: 3 +float: 3 + +Function: Real part of "cexp_upward": +double: 2 +float: 2 + +Function: Imaginary part of "cexp_upward": +double: 3 +float: 2 + +Function: Real part of "clog": +double: 3 +float: 3 + +Function: Imaginary part of "clog": +double: 1 +float: 1 + +Function: Real part of "clog10": +double: 3 +float: 4 + +Function: Imaginary part of "clog10": +double: 2 +float: 2 + +Function: Real part of "clog10_downward": +double: 7 +float: 5 + +Function: Imaginary part of "clog10_downward": +double: 8 +float: 4 + +Function: Real part of "clog10_towardzero": +double: 5 +float: 5 + +Function: Imaginary part of "clog10_towardzero": +double: 8 +float: 4 + +Function: Real part of "clog10_upward": +double: 6 +float: 5 + +Function: Imaginary part of "clog10_upward": +double: 5 +float: 4 + +Function: Real part of "clog_downward": +double: 4 +float: 3 + +Function: Imaginary part of "clog_downward": +double: 5 +float: 2 + +Function: Real part of "clog_towardzero": +double: 8 +float: 4 + +Function: Imaginary part of "clog_towardzero": +double: 5 +float: 3 + +Function: Real part of 
"clog_upward": +double: 8 +float: 3 + +Function: Imaginary part of "clog_upward": +double: 8 +float: 2 + +Function: "cos": +double: 1 +float: 1 + +Function: "cos_downward": +double: 1 +float: 1 + +Function: "cos_towardzero": +double: 4 +float: 1 + +Function: "cos_upward": +double: 4 +float: 1 + +Function: "cosh": +double: 2 +float: 2 + +Function: "cosh_downward": +double: 3 +float: 1 + +Function: "cosh_towardzero": +double: 3 +float: 1 + +Function: "cosh_upward": +double: 2 +float: 2 + +Function: Real part of "cpow": +double: 2 +float: 5 + +Function: Imaginary part of "cpow": +float: 2 + +Function: Real part of "cpow_downward": +double: 5 +float: 8 + +Function: Imaginary part of "cpow_downward": +double: 2 +float: 2 + +Function: Real part of "cpow_towardzero": +double: 5 +float: 8 + +Function: Imaginary part of "cpow_towardzero": +double: 2 +float: 2 + +Function: Real part of "cpow_upward": +double: 4 +float: 1 + +Function: Imaginary part of "cpow_upward": +double: 2 +float: 2 + +Function: Real part of "csin": +double: 1 +float: 1 + +Function: Real part of "csin_downward": +double: 3 +float: 3 + +Function: Imaginary part of "csin_downward": +double: 3 +float: 1 + +Function: Real part of "csin_towardzero": +double: 3 +float: 3 + +Function: Imaginary part of "csin_towardzero": +double: 3 +float: 1 + +Function: Real part of "csin_upward": +double: 2 +float: 2 + +Function: Imaginary part of "csin_upward": +double: 1 +float: 2 + +Function: Real part of "csinh": +float: 1 + +Function: Imaginary part of "csinh": +double: 1 +float: 1 + +Function: Real part of "csinh_downward": +double: 3 +float: 1 + +Function: Imaginary part of "csinh_downward": +double: 3 +float: 3 + +Function: Real part of "csinh_towardzero": +double: 3 +float: 2 + +Function: Imaginary part of "csinh_towardzero": +double: 3 +float: 3 + +Function: Real part of "csinh_upward": +double: 1 +float: 2 + +Function: Imaginary part of "csinh_upward": +double: 2 +float: 2 + +Function: Real part of "csqrt": 
+double: 2 +float: 2 + +Function: Imaginary part of "csqrt": +double: 2 +float: 2 + +Function: Real part of "csqrt_downward": +double: 5 +float: 4 + +Function: Imaginary part of "csqrt_downward": +double: 4 +float: 3 + +Function: Real part of "csqrt_towardzero": +double: 4 +float: 3 + +Function: Imaginary part of "csqrt_towardzero": +double: 4 +float: 3 + +Function: Real part of "csqrt_upward": +double: 5 +float: 4 + +Function: Imaginary part of "csqrt_upward": +double: 3 +float: 3 + +Function: Real part of "ctan": +double: 1 +float: 1 + +Function: Imaginary part of "ctan": +double: 2 +float: 2 + +Function: Real part of "ctan_downward": +double: 6 +float: 5 + +Function: Imaginary part of "ctan_downward": +double: 3 +float: 2 + +Function: Real part of "ctan_towardzero": +double: 5 +float: 3 + +Function: Imaginary part of "ctan_towardzero": +double: 3 +float: 2 + +Function: Real part of "ctan_upward": +double: 2 +float: 4 + +Function: Imaginary part of "ctan_upward": +double: 2 +float: 1 + +Function: Real part of "ctanh": +double: 2 +float: 2 + +Function: Imaginary part of "ctanh": +double: 2 +float: 2 + +Function: Real part of "ctanh_downward": +double: 3 +float: 2 + +Function: Imaginary part of "ctanh_downward": +double: 6 +float: 5 + +Function: Real part of "ctanh_towardzero": +double: 3 +float: 2 + +Function: Imaginary part of "ctanh_towardzero": +double: 5 +float: 3 + +Function: Real part of "ctanh_upward": +double: 2 +float: 2 + +Function: Imaginary part of "ctanh_upward": +double: 2 +float: 3 + +Function: "erf": +double: 1 +float: 1 + +Function: "erf_downward": +double: 1 +float: 1 + +Function: "erf_towardzero": +double: 1 +float: 1 + +Function: "erf_upward": +double: 1 +float: 1 + +Function: "erfc": +double: 5 +float: 3 + +Function: "erfc_downward": +double: 5 +float: 6 + +Function: "erfc_towardzero": +double: 3 +float: 4 + +Function: "erfc_upward": +double: 5 +float: 6 + +Function: "exp": +double: 1 +float: 1 + +Function: "exp10": +double: 2 +float: 1 + 
+Function: "exp10_downward": +double: 3 +float: 1 + +Function: "exp10_towardzero": +double: 3 +float: 1 + +Function: "exp10_upward": +double: 2 +float: 1 + +Function: "exp2": +double: 1 + +Function: "exp2_downward": +double: 1 + +Function: "exp2_towardzero": +double: 1 + +Function: "exp2_upward": +double: 1 +float: 1 + +Function: "exp_downward": +double: 1 +float: 1 + +Function: "exp_towardzero": +double: 1 +float: 1 + +Function: "exp_upward": +double: 1 +float: 1 + +Function: "expm1": +double: 1 +float: 1 + +Function: "expm1_downward": +double: 1 +float: 1 + +Function: "expm1_towardzero": +double: 1 +float: 2 + +Function: "expm1_upward": +double: 1 +float: 1 + +Function: "gamma": +double: 4 +float: 7 + +Function: "gamma_downward": +double: 7 +float: 7 + +Function: "gamma_towardzero": +double: 7 +float: 7 + +Function: "gamma_upward": +double: 5 +float: 6 + +Function: "hypot": +double: 1 +float: 1 + +Function: "hypot_downward": +double: 1 + +Function: "hypot_towardzero": +double: 1 + +Function: "hypot_upward": +double: 1 + +Function: "j0": +double: 2 +float: 9 + +Function: "j0_downward": +double: 5 +float: 9 + +Function: "j0_towardzero": +double: 6 +float: 9 + +Function: "j0_upward": +double: 9 +float: 9 + +Function: "j1": +double: 4 +float: 9 + +Function: "j1_downward": +double: 5 +float: 8 + +Function: "j1_towardzero": +double: 4 +float: 8 + +Function: "j1_upward": +double: 9 +float: 9 + +Function: "jn": +double: 4 +float: 4 + +Function: "jn_downward": +double: 7 +float: 9 + +Function: "jn_towardzero": +double: 7 +float: 9 + +Function: "jn_upward": +double: 7 +float: 9 + +Function: "lgamma": +double: 4 +float: 7 + +Function: "lgamma_downward": +double: 7 +float: 7 + +Function: "lgamma_towardzero": +double: 7 +float: 7 + +Function: "lgamma_upward": +double: 5 +float: 6 + +Function: "log10": +double: 2 +float: 2 + +Function: "log10_downward": +double: 2 +float: 3 + +Function: "log10_towardzero": +double: 2 +float: 1 + +Function: "log10_upward": +double: 2 +float: 2 
+ +Function: "log1p": +double: 1 +float: 1 + +Function: "log1p_downward": +double: 2 +float: 2 + +Function: "log1p_towardzero": +double: 2 +float: 2 + +Function: "log1p_upward": +double: 2 +float: 2 + +Function: "log2": +float: 1 + +Function: "log2_downward": +double: 1 + +Function: "log2_towardzero": +double: 1 + +Function: "log2_upward": +double: 1 + +Function: "pow": +double: 1 + +Function: "pow_downward": +double: 1 +float: 1 + +Function: "pow_towardzero": +double: 1 +float: 1 + +Function: "pow_upward": +double: 1 +float: 1 + +Function: "sin": +double: 1 +float: 1 + +Function: "sin_downward": +double: 4 +float: 1 + +Function: "sin_towardzero": +double: 3 +float: 1 + +Function: "sin_upward": +double: 7 +float: 1 + +Function: "sincos": +double: 1 + +Function: "sincos_downward": +double: 1 +float: 1 + +Function: "sincos_towardzero": +double: 4 +float: 1 + +Function: "sincos_upward": +double: 1 +float: 1 + +Function: "sinh": +double: 2 +float: 2 + +Function: "sinh_downward": +double: 3 +float: 3 + +Function: "sinh_towardzero": +double: 3 +float: 2 + +Function: "sinh_upward": +double: 3 +float: 3 + +Function: "tan": +float: 1 + +Function: "tan_downward": +double: 1 +float: 2 + +Function: "tan_towardzero": +double: 1 +float: 1 + +Function: "tan_upward": +double: 1 +float: 1 + +Function: "tanh": +double: 2 +float: 2 + +Function: "tanh_downward": +double: 3 +float: 3 + +Function: "tanh_towardzero": +double: 2 +float: 2 + +Function: "tanh_upward": +double: 3 +float: 3 + +Function: "tgamma": +double: 9 +float: 8 + +Function: "tgamma_downward": +double: 9 +float: 9 + +Function: "tgamma_towardzero": +double: 9 +float: 8 + +Function: "tgamma_upward": +double: 9 +float: 8 + +Function: "y0": +double: 3 +float: 9 + +Function: "y0_downward": +double: 3 +float: 9 + +Function: "y0_towardzero": +double: 4 +float: 9 + +Function: "y0_upward": +double: 3 +float: 9 + +Function: "y1": +double: 3 +float: 9 + +Function: "y1_downward": +double: 6 +float: 9 + +Function: "y1_towardzero": 
+double: 3 +float: 9 + +Function: "y1_upward": +double: 7 +float: 9 + +Function: "yn": +double: 3 +float: 3 + +Function: "yn_downward": +double: 6 +float: 8 + +Function: "yn_towardzero": +double: 8 +float: 8 + +Function: "yn_upward": +double: 8 +float: 8 + +# end of automatic generation diff --git a/sysdeps/or1k/fpu/libm-test-ulps-name b/sysdeps/or1k/fpu/libm-test-ulps-name new file mode 100644 index 0000000000..7f72f7a873 --- /dev/null +++ b/sysdeps/or1k/fpu/libm-test-ulps-name @@ -0,0 +1 @@ +OpenRISC hard-float diff --git a/sysdeps/or1k/fpu_control.h b/sysdeps/or1k/fpu_control.h new file mode 100644 index 0000000000..f89364a31e --- /dev/null +++ b/sysdeps/or1k/fpu_control.h @@ -0,0 +1,89 @@ +/* FPU control word bits. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _FPU_CONTROL_H +#define _FPU_CONTROL_H + +#ifndef __or1k_hard_float__ + +# define _FPU_RESERVED 0xffffffff +# define _FPU_DEFAULT 0x00000000 +# define _FPU_GETCW(cw) (cw) = 0 +# define _FPU_SETCW(cw) (void) (cw) + +#else /* __or1k_hard_float__ */ + +/* Layout of FPCSR: + + The bits of the FPCSR are defined as follows, this should help + explain how the masks below have come to be. 
+ + +-----------+----------------------------+-----+----+ + | 32 - 12 | 11 10 9 8 7 6 5 4 3 | 2-1 | 0 | + +-----------+----------------------------+-----+----+ + | Reserved | DZ IN IV IX Z QN SN UN OV | RM | EE | + +-----------+----------------------------+-----+----+ + + Exception flags: + + DZ - divide by zero flag. + IN - infinite flag. + IV - invalid flag. + IX - inexact flag. + Z - zero flag. + QN - qnan flag. + SN - snan flag. + UN - underflow flag. + OV - overflow flag. + + Rounding modes: + + The FPCSR bits 2-1 labeled above as RM specify the rounding mode. + + 00 - round to nearest + 01 - round to zero + 10 - round to positive infinity + 11 - round to negative infinity + + Enabling exceptions: + + EE - set to enable FPU exceptions. + + */ + +# define _FPU_RESERVED 0xfffff000 +/* Default: rounding to nearest with exceptions disabled. */ +# define _FPU_DEFAULT 0 +/* IEEE: Same as above with exceptions enabled. */ +# define _FPU_IEEE (_FPU_DEFAULT | 1) + +# define _FPU_FPCSR_RM_MASK (0x3 << 1) + +/* Macros for accessing the hardware control word. */ +# define _FPU_GETCW(cw) __asm__ volatile ("l.mfspr %0,r0,20" : "=r" (cw)) +# define _FPU_SETCW(cw) __asm__ volatile ("l.mtspr r0,%0,20" : : "r" (cw)) + +#endif /* __or1k_hard_float__ */ + +/* Type of the control word. */ +typedef unsigned int fpu_control_t; + +/* Default control word set at startup. */ +extern fpu_control_t __fpu_control; + +#endif /* fpu_control.h */ diff --git a/sysdeps/or1k/libm-test-ulps-name b/sysdeps/or1k/libm-test-ulps-name deleted file mode 100644 index 0af6591fd9..0000000000 --- a/sysdeps/or1k/libm-test-ulps-name +++ /dev/null @@ -1 +0,0 @@ -OpenRISC diff --git a/sysdeps/or1k/math-tests-snan-payload.h b/sysdeps/or1k/math-tests-snan-payload.h new file mode 100644 index 0000000000..62467a371c --- /dev/null +++ b/sysdeps/or1k/math-tests-snan-payload.h @@ -0,0 +1,26 @@ +/* Configuration for math tests: sNaN payloads. OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef OR1K_MATH_TESTS_SNAN_PAYLOAD_H +#define OR1K_MATH_TESTS_SNAN_PAYLOAD_H 1 + +/* OpenRISC floating-point instructions do not preserve NaN + payloads. */ +#define SNAN_TESTS_PRESERVE_PAYLOAD 0 + +#endif /* math-tests-snan-payload.h */ diff --git a/sysdeps/or1k/math-tests-trap.h b/sysdeps/or1k/math-tests-trap.h new file mode 100644 index 0000000000..a95b42d66d --- /dev/null +++ b/sysdeps/or1k/math-tests-trap.h @@ -0,0 +1,27 @@ +/* Configuration for math tests: support for enabling exception traps. + OpenRISC version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef OR1K_MATH_TESTS_TRAP_H +#define OR1K_MATH_TESTS_TRAP_H 1 + +#include <fenv.h> + +#define EXCEPTION_ENABLE_SUPPORTED(EXCEPT) ((EXCEPT) == 0) + +#endif /* math-tests-trap.h */ diff --git a/sysdeps/or1k/libm-test-ulps b/sysdeps/or1k/nofpu/libm-test-ulps index 785bae70d0..785bae70d0 100644 --- a/sysdeps/or1k/libm-test-ulps +++ b/sysdeps/or1k/nofpu/libm-test-ulps diff --git a/sysdeps/or1k/nofpu/libm-test-ulps-name b/sysdeps/or1k/nofpu/libm-test-ulps-name new file mode 100644 index 0000000000..76c66a0e53 --- /dev/null +++ b/sysdeps/or1k/nofpu/libm-test-ulps-name @@ -0,0 +1 @@ +OpenRISC soft-float diff --git a/sysdeps/or1k/sfp-machine.h b/sysdeps/or1k/sfp-machine.h index d17fd37730..70aa421ef0 100644 --- a/sysdeps/or1k/sfp-machine.h +++ b/sysdeps/or1k/sfp-machine.h @@ -90,4 +90,21 @@ #define FP_ROUNDMODE (_fpcsr & FP_RND_MASK) +#ifdef __or1k_hard_float__ +#define FP_INIT_ROUNDMODE \ +do { \ + __asm__ volatile ("l.mfspr %0,r0,20" : "=r" (_fpcsr)); \ +} while (0) + +#define FP_HANDLE_EXCEPTIONS \ +do { \ + if (__builtin_expect (_fex, 0)) \ + { \ + _fpcsr &= ~FP_EX_ALL; \ + _fpcsr |= _fex; \ + __asm__ volatile ("l.mtspr r0,%0,20" : : "r" (_fpcsr)); \ + } \ +} while (0) +#endif /* __or1k_hard_float__ */ + +#define _FP_TININESS_AFTER_ROUNDING 0 diff --git a/sysdeps/powerpc/fpu/fenv_libc.h b/sysdeps/powerpc/fpu/fenv_libc.h index f9167056a8..0a06e4486b 100644 --- a/sysdeps/powerpc/fpu/fenv_libc.h +++ b/sysdeps/powerpc/fpu/fenv_libc.h @@ -182,19 +182,13 @@ __fesetround_inline (int round) return 0; } -/* Same as __fesetround_inline, however without runtime check to use DFP - mtfsfi syntax (as relax_fenv_state) or if round value is valid. */ +/* Same as __fesetround_inline, and it also disables the floating-point + inexact exception (bit 60 - XE, assuming NI is 0). It does not check + if ROUND is a valid value. 
*/ static inline void -__fesetround_inline_nocheck (const int round) +__fesetround_inline_disable_inexact (const int round) { -#ifdef _ARCH_PWR9 - __fe_mffscrn (round); -#else - if (__glibc_likely (GLRO(dl_hwcap2) & PPC_FEATURE2_ARCH_3_00)) - __fe_mffscrn (round); - else - asm volatile ("mtfsfi 7,%0" : : "n" (round)); -#endif + asm volatile ("mtfsfi 7,%0" : : "n" (round)); } #define FPSCR_MASK(bit) (1 << (31 - (bit))) diff --git a/sysdeps/powerpc/fpu/libm-test-ulps b/sysdeps/powerpc/fpu/libm-test-ulps index dbf71d8194..78e9a9ad6e 100644 --- a/sysdeps/powerpc/fpu/libm-test-ulps +++ b/sysdeps/powerpc/fpu/libm-test-ulps @@ -1506,6 +1506,30 @@ float: 3 float128: 1 ldouble: 4 +Function: "log2p1": +double: 1 +float: 1 +float128: 2 +ldouble: 2 + +Function: "log2p1_downward": +double: 2 +float: 2 +float128: 3 +ldouble: 4 + +Function: "log2p1_towardzero": +double: 2 +float: 2 +float128: 2 +ldouble: 4 + +Function: "log2p1_upward": +double: 1 +float: 2 +float128: 2 +ldouble: 4 + Function: "log_downward": float: 2 float128: 1 diff --git a/sysdeps/powerpc/fpu/round_to_integer.h b/sysdeps/powerpc/fpu/round_to_integer.h index b68833640f..6996519c61 100644 --- a/sysdeps/powerpc/fpu/round_to_integer.h +++ b/sysdeps/powerpc/fpu/round_to_integer.h @@ -42,14 +42,14 @@ set_fenv_mode (enum round_mode mode) switch (mode) { case CEIL: - __fesetround_inline_nocheck (FE_UPWARD); + __fesetround_inline_disable_inexact (FE_UPWARD); break; case FLOOR: - __fesetround_inline_nocheck (FE_DOWNWARD); + __fesetround_inline_disable_inexact (FE_DOWNWARD); break; case TRUNC: case ROUND: - __fesetround_inline_nocheck (FE_TOWARDZERO); + __fesetround_inline_disable_inexact (FE_TOWARDZERO); break; case NEARBYINT: /* Disable FE_INEXACT exception */ diff --git a/sysdeps/powerpc/nofpu/Makefile b/sysdeps/powerpc/nofpu/Makefile index fa462339ba..96657c9676 100644 --- a/sysdeps/powerpc/nofpu/Makefile +++ b/sysdeps/powerpc/nofpu/Makefile @@ -54,4 +54,5 @@ CFLAGS-s_fmaximum_magl.c += -fno-builtin-fabsl 
CFLAGS-s_fmaximum_mag_numl.c += -fno-builtin-fabsl CFLAGS-s_fminimum_magl.c += -fno-builtin-fabsl CFLAGS-s_fminimum_mag_numl.c += -fno-builtin-fabsl +CFLAGS-s_log2p1l.c += -fno-builtin-fabsl endif diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/multiarch/s_llrintf-power6.c b/sysdeps/powerpc/powerpc32/power4/fpu/multiarch/s_llrintf-power6.c index 5b1a5a8cf0..3c279ddd1a 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/multiarch/s_llrintf-power6.c +++ b/sysdeps/powerpc/powerpc32/power4/fpu/multiarch/s_llrintf-power6.c @@ -1,2 +1,6 @@ +#include <libm-alias-float.h> + #define __llrintf __llrintf_power6 +#undef libm_alias_float +#define libm_alias_float(a, b) #include <sysdeps/powerpc/powerpc32/fpu/s_llrintf.c> diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/multiarch/s_llrintf-ppc32.c b/sysdeps/powerpc/powerpc32/power4/fpu/multiarch/s_llrintf-ppc32.c index 061962b84d..bce8f4564f 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/multiarch/s_llrintf-ppc32.c +++ b/sysdeps/powerpc/powerpc32/power4/fpu/multiarch/s_llrintf-ppc32.c @@ -1,2 +1,6 @@ +#include <libm-alias-float.h> + #define __llrintf __llrintf_ppc32 +#undef libm_alias_float +#define libm_alias_float(a, b) #include <sysdeps/powerpc/powerpc32/fpu/s_llrintf.c> diff --git a/sysdeps/powerpc/powerpc64/configure b/sysdeps/powerpc/powerpc64/configure index 6d8153ed50..9df2ceaa1f 100644 --- a/sysdeps/powerpc/powerpc64/configure +++ b/sysdeps/powerpc/powerpc64/configure @@ -12,7 +12,7 @@ then : else $as_nop libc_cv_overlapping_opd=no echo 'void foo (void) {}' > conftest.c -if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -S conftest.c -o conftest.s 1>&5' +if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $libc_cv_cc_submachine -S conftest.c -o conftest.s 1>&5' { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 (eval $ac_try) 2>&5 ac_status=$? 
@@ -51,7 +51,7 @@ int bar (void); int foo (void) { return bar () + 1; } EOF libc_cv_ppc64_notoc=no - if { ac_try='${CC-cc} $libc_cv_cc_submachine $CFLAGS $CPPFLAGS -S -o conftest.s conftest.c' + if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $libc_cv_cc_submachine -S -o conftest.s conftest.c' { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 (eval $ac_try) 2>&5 ac_status=$? diff --git a/sysdeps/powerpc/powerpc64/configure.ac b/sysdeps/powerpc/powerpc64/configure.ac index 575745af3e..225d66ef1a 100644 --- a/sysdeps/powerpc/powerpc64/configure.ac +++ b/sysdeps/powerpc/powerpc64/configure.ac @@ -7,7 +7,7 @@ AC_CACHE_CHECK(for support for overlapping .opd entries, libc_cv_overlapping_opd, [dnl libc_cv_overlapping_opd=no echo 'void foo (void) {}' > conftest.c -if AC_TRY_COMMAND(${CC-cc} $CFLAGS $CPPFLAGS -S conftest.c -o conftest.s 1>&AS_MESSAGE_LOG_FD); then +if AC_TRY_COMMAND(${CC-cc} $CFLAGS $CPPFLAGS $libc_cv_cc_submachine -S conftest.c -o conftest.s 1>&AS_MESSAGE_LOG_FD); then changequote(,)dnl if grep '\.TOC\.@tocbase' conftest.s > /dev/null; then if grep '\.TOC\.@tocbase[ ]*,[ ]*0' conftest.s > /dev/null; then @@ -35,7 +35,7 @@ int bar (void); int foo (void) { return bar () + 1; } EOF libc_cv_ppc64_notoc=no - if AC_TRY_COMMAND([${CC-cc} $libc_cv_cc_submachine $CFLAGS $CPPFLAGS -S -o conftest.s conftest.c]) \ + if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS $libc_cv_cc_submachine -S -o conftest.s conftest.c]) \ && AC_TRY_COMMAND([grep -q -E 'bar@notoc' conftest.s]) then libc_cv_ppc64_notoc=yes diff --git a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S b/sysdeps/powerpc/powerpc64/le/power10/strncmp.S new file mode 100644 index 0000000000..d4ba76acae --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power10/strncmp.S @@ -0,0 +1,271 @@ +/* Optimized strncmp implementation for PowerPC64/POWER10. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Implements the function + + int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n) + + The implementation uses unaligned doubleword access to avoid specialized + code paths depending on data alignment for first 32 bytes and uses + vectorised loops after that. */ + +#ifndef STRNCMP +# define STRNCMP strncmp +#endif + +/* TODO: Change this to actual instructions when minimum binutils is upgraded + to 2.27. Macros are defined below for these newer instructions in order + to maintain compatibility. */ + +#define LXVP(xtp,dq,ra) \ + .long(((6)<<(32-6)) \ + | ((((xtp)-32)>>1)<<(32-10)) \ + | ((1)<<(32-11)) \ + | ((ra)<<(32-16)) \ + | dq) + +#define COMPARE_16(vreg1,vreg2,offset) \ + lxv vreg1+32,offset(r3); \ + lxv vreg2+32,offset(r4); \ + vcmpnezb. v7,vreg1,vreg2; \ + bne cr6,L(different); \ + cmpldi cr7,r5,16; \ + ble cr7,L(ret0); \ + addi r5,r5,-16; + +#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ + LXVP(vreg1+32,offset,r3); \ + LXVP(vreg2+32,offset,r4); \ + vcmpnezb. v7,vreg1+1,vreg2+1; \ + bne cr6,L(label1); \ + vcmpnezb. 
v7,vreg1,vreg2; \ + bne cr6,L(label2); \ + cmpldi cr7,r5,32; \ + ble cr7,L(ret0); \ + addi r5,r5,-32; + +#define TAIL_FIRST_16B(vreg1,vreg2) \ + vctzlsbb r6,v7; \ + cmpld cr7,r5,r6; \ + ble cr7,L(ret0); \ + vextubrx r5,r6,vreg1; \ + vextubrx r4,r6,vreg2; \ + subf r3,r4,r5; \ + blr; + +#define TAIL_SECOND_16B(vreg1,vreg2) \ + vctzlsbb r6,v7; \ + addi r0,r6,16; \ + cmpld cr7,r5,r0; \ + ble cr7,L(ret0); \ + vextubrx r5,r6,vreg1; \ + vextubrx r4,r6,vreg2; \ + subf r3,r4,r5; \ + blr; + +#define CHECK_N_BYTES(reg1,reg2,len_reg) \ + sldi r6,len_reg,56; \ + lxvl 32+v4,reg1,r6; \ + lxvl 32+v5,reg2,r6; \ + add reg1,reg1,len_reg; \ + add reg2,reg2,len_reg; \ + vcmpnezb v7,v4,v5; \ + vctzlsbb r6,v7; \ + cmpld cr7,r6,len_reg; \ + blt cr7,L(different); \ + cmpld cr7,r5,len_reg; \ + ble cr7,L(ret0); \ + sub r5,r5,len_reg; \ + + /* TODO: change this to .machine power10 when the minimum required + binutils allows it. */ + .machine power9 +ENTRY_TOCLESS (STRNCMP, 4) + /* Check if size is 0. */ + cmpdi cr0,r5,0 + beq cr0,L(ret0) + andi. r7,r3,4095 + andi. r8,r4,4095 + cmpldi cr0,r7,4096-16 + cmpldi cr1,r8,4096-16 + bgt cr0,L(crosses) + bgt cr1,L(crosses) + COMPARE_16(v4,v5,0) + addi r3,r3,16 + addi r4,r4,16 + +L(crosses): + andi. r7,r3,15 + subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */ + andi. r9,r4,15 + subfic r8,r9,16 /* r8(nalign2) = 16 - (str2 & 15). */ + cmpld cr7,r7,r8 + beq cr7,L(same_aligned) + blt cr7,L(nalign1_min) + + /* nalign2 is minimum and s2 pointer is aligned. */ + CHECK_N_BYTES(r3,r4,r8) + /* Are we on the 64B hunk which crosses a page? */ + andi. r10,r3,63 /* Determine offset into 64B hunk. */ + andi. r8,r3,15 /* The offset into the 16B hunk. */ + neg r7,r3 + andi. r9,r7,15 /* Number of bytes after a 16B cross. */ + rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ + beq L(compare_64_pagecross) + mtctr r7 + b L(compare_64B_unaligned) + + /* nalign1 is minimum and s1 pointer is aligned. 
*/ +L(nalign1_min): + CHECK_N_BYTES(r3,r4,r7) + /* Are we on the 64B hunk which crosses a page? */ + andi. r10,r4,63 /* Determine offset into 64B hunk. */ + andi. r8,r4,15 /* The offset into the 16B hunk. */ + neg r7,r4 + andi. r9,r7,15 /* Number of bytes after a 16B cross. */ + rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ + beq L(compare_64_pagecross) + mtctr r7 + + .p2align 5 +L(compare_64B_unaligned): + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + COMPARE_16(v4,v5,32) + COMPARE_16(v4,v5,48) + addi r3,r3,64 + addi r4,r4,64 + bdnz L(compare_64B_unaligned) + + /* Cross the page boundary of s2, carefully. Only for first + iteration we have to get the count of 64B blocks to be checked. + From second iteration and beyond, loop counter is always 63. */ +L(compare_64_pagecross): + li r11, 63 + mtctr r11 + cmpldi r10,16 + ble L(cross_4) + cmpldi r10,32 + ble L(cross_3) + cmpldi r10,48 + ble L(cross_2) +L(cross_1): + CHECK_N_BYTES(r3,r4,r9) + CHECK_N_BYTES(r3,r4,r8) + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + COMPARE_16(v4,v5,32) + addi r3,r3,48 + addi r4,r4,48 + b L(compare_64B_unaligned) +L(cross_2): + COMPARE_16(v4,v5,0) + addi r3,r3,16 + addi r4,r4,16 + CHECK_N_BYTES(r3,r4,r9) + CHECK_N_BYTES(r3,r4,r8) + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + addi r3,r3,32 + addi r4,r4,32 + b L(compare_64B_unaligned) +L(cross_3): + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + addi r3,r3,32 + addi r4,r4,32 + CHECK_N_BYTES(r3,r4,r9) + CHECK_N_BYTES(r3,r4,r8) + COMPARE_16(v4,v5,0) + addi r3,r3,16 + addi r4,r4,16 + b L(compare_64B_unaligned) +L(cross_4): + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + COMPARE_16(v4,v5,32) + addi r3,r3,48 + addi r4,r4,48 + CHECK_N_BYTES(r3,r4,r9) + CHECK_N_BYTES(r3,r4,r8) + b L(compare_64B_unaligned) + +L(same_aligned): + CHECK_N_BYTES(r3,r4,r7) + /* Align s1 to 32B and adjust s2 address. + Use lxvp only if both s1 and s2 are 32B aligned. 
*/ + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + COMPARE_16(v4,v5,32) + COMPARE_16(v4,v5,48) + addi r3,r3,64 + addi r4,r4,64 + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + addi r5,r5,32 + + clrldi r6,r3,59 + subfic r7,r6,32 + add r3,r3,r7 + add r4,r4,r7 + subf r5,r7,r5 + andi. r7,r4,0x1F + beq cr0,L(32B_aligned_loop) + + .p2align 5 +L(16B_aligned_loop): + COMPARE_16(v4,v5,0) + COMPARE_16(v4,v5,16) + COMPARE_16(v4,v5,32) + COMPARE_16(v4,v5,48) + addi r3,r3,64 + addi r4,r4,64 + b L(16B_aligned_loop) + + /* Calculate and return the difference. */ +L(different): + TAIL_FIRST_16B(v4,v5) + + .p2align 5 +L(32B_aligned_loop): + COMPARE_32(v14,v16,0,tail1,tail2) + COMPARE_32(v18,v20,32,tail3,tail4) + COMPARE_32(v22,v24,64,tail5,tail6) + COMPARE_32(v26,v28,96,tail7,tail8) + addi r3,r3,128 + addi r4,r4,128 + b L(32B_aligned_loop) + +L(tail1): TAIL_FIRST_16B(v15,v17) +L(tail2): TAIL_SECOND_16B(v14,v16) +L(tail3): TAIL_FIRST_16B(v19,v21) +L(tail4): TAIL_SECOND_16B(v18,v20) +L(tail5): TAIL_FIRST_16B(v23,v25) +L(tail6): TAIL_SECOND_16B(v22,v24) +L(tail7): TAIL_FIRST_16B(v27,v29) +L(tail8): TAIL_SECOND_16B(v26,v28) + + .p2align 5 +L(ret0): + li r3,0 + blr + +END(STRNCMP) +libc_hidden_builtin_def(strncmp) diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index a38ff46448..b847c19049 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -34,7 +34,7 @@ ifneq (,$(filter %le,$(config-machine))) sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ memmove-power10 memset-power10 rawmemchr-power9 \ rawmemchr-power10 strcmp-power9 strcmp-power10 \ - strncmp-power9 strcpy-power9 stpcpy-power9 \ + strncmp-power9 strncmp-power10 strcpy-power9 stpcpy-power9 \ strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 
b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 30fd89e109..2bb47d3527 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -164,6 +164,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c. */ IFUNC_IMPL (i, name, strncmp, #ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_1 + && hwcap & PPC_FEATURE_HAS_VSX, + __strncmp_power10) IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC, __strncmp_power9) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchrnul-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strchrnul-power7.S index 384edce8b9..fc784c2856 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strchrnul-power7.S +++ b/sysdeps/powerpc/powerpc64/multiarch/strchrnul-power7.S @@ -18,6 +18,9 @@ #define STRCHRNUL __strchrnul_power7 +#undef weak_alias +#define weak_alias(a,b) + #undef libc_hidden_builtin_def #define libc_hidden_builtin_def(name) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchrnul-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strchrnul-ppc64.c index 8e42504efe..7127972250 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strchrnul-ppc64.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strchrnul-ppc64.c @@ -16,4 +16,16 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#include <sysdeps/powerpc/powerpc32/power4/multiarch/strchrnul-ppc32.c> +#include <string.h> + +#if IS_IN (libc) +# define STRCHRNUL __strchrnul_ppc +extern __typeof (strchrnul) __strchrnul_ppc attribute_hidden; + +# include <string/strchrnul.c> +# undef __strchrnul +weak_alias (__strchrnul_ppc, __strchrnul) +# ifdef SHARED +__hidden_ver1 (__strchrnul_ppc, __GI___strchrnul, __strchrnul_ppc); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase_l-power7.c b/sysdeps/powerpc/powerpc64/multiarch/strncase_l-power7.c index 1587079580..664ce44345 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strncase_l-power7.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strncase_l-power7.c @@ -19,6 +19,9 @@ #define __strncasecmp_l __strncasecmp_l_power7 +#undef weak_alias +#define weak_alias(a,b) + #undef libc_hidden_def #define libc_hidden_def(name) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S new file mode 100644 index 0000000000..d7026c12e2 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S @@ -0,0 +1,25 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +#define STRNCMP __strncmp_power10 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/le/power10/strncmp.S> +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c index 6178f4a432..a5ed67f766 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c @@ -29,6 +29,7 @@ extern __typeof (strncmp) __strncmp_ppc attribute_hidden; extern __typeof (strncmp) __strncmp_power8 attribute_hidden; # ifdef __LITTLE_ENDIAN__ extern __typeof (strncmp) __strncmp_power9 attribute_hidden; +extern __typeof (strncmp) __strncmp_power10 attribute_hidden; # endif # undef strncmp @@ -36,6 +37,9 @@ extern __typeof (strncmp) __strncmp_power9 attribute_hidden; ifunc symbol properly. */ libc_ifunc_redirected (__redirect_strncmp, strncmp, # ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_1 + && hwcap & PPC_FEATURE_HAS_VSX) + ? __strncmp_power10 : (hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC) ? 
__strncmp_power9 : diff --git a/sysdeps/pthread/Makefile b/sysdeps/pthread/Makefile index f9efb50764..04ea56559e 100644 --- a/sysdeps/pthread/Makefile +++ b/sysdeps/pthread/Makefile @@ -282,6 +282,10 @@ tests += \ tst-vfork2x \ # tests +tests-2.0 += \ + tst-pthread_kill-exited + # tests-2.0 + tests-time64 += \ tst-abstime-time64 \ tst-cnd-timedwait-time64 \ diff --git a/sysdeps/s390/fpu/libm-test-ulps b/sysdeps/s390/fpu/libm-test-ulps index a898eebb20..e24adb94e8 100644 --- a/sysdeps/s390/fpu/libm-test-ulps +++ b/sysdeps/s390/fpu/libm-test-ulps @@ -1201,6 +1201,26 @@ Function: "log2_upward": double: 1 ldouble: 1 +Function: "log2p1": +double: 1 +float: 1 +ldouble: 3 + +Function: "log2p1_downward": +double: 2 +float: 2 +ldouble: 3 + +Function: "log2p1_towardzero": +double: 2 +float: 2 +ldouble: 2 + +Function: "log2p1_upward": +double: 1 +float: 2 +ldouble: 2 + Function: "log_downward": ldouble: 1 diff --git a/sysdeps/sparc/sparc64/rtld-memset.c b/sysdeps/sparc/sparc64/rtld-memset.c index 55f3835790..a19202a620 100644 --- a/sysdeps/sparc/sparc64/rtld-memset.c +++ b/sysdeps/sparc/sparc64/rtld-memset.c @@ -1 +1,4 @@ #include <string/memset.c> +#if IS_IN(rtld) +strong_alias (memset, __memset_ultra1) +#endif diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile index 415aa1f14d..ae66590e91 100644 --- a/sysdeps/unix/sysv/linux/Makefile +++ b/sysdeps/unix/sysv/linux/Makefile @@ -200,6 +200,7 @@ tests += \ tst-clone2 \ tst-clone3 \ tst-epoll \ + tst-epoll-ioctls \ tst-fanotify \ tst-fdopendir-o_path \ tst-getauxval \ diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h index f82bde5d30..8dceaa1a52 100644 --- a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h +++ b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h @@ -103,3 +103,18 @@ #define HWCAP2_SVE_B16B16 (1UL << 45) #define HWCAP2_LRCPC3 (1UL << 46) #define HWCAP2_LSE128 (1UL << 47) +#define HWCAP2_FPMR (1UL << 48) +#define HWCAP2_LUT (1UL << 49) 
+#define HWCAP2_FAMINMAX (1UL << 50) +#define HWCAP2_F8CVT (1UL << 51) +#define HWCAP2_F8FMA (1UL << 52) +#define HWCAP2_F8DP4 (1UL << 53) +#define HWCAP2_F8DP2 (1UL << 54) +#define HWCAP2_F8E4M3 (1UL << 55) +#define HWCAP2_F8E5M2 (1UL << 56) +#define HWCAP2_SME_LUTV2 (1UL << 57) +#define HWCAP2_SME_F8F16 (1UL << 58) +#define HWCAP2_SME_F8F32 (1UL << 59) +#define HWCAP2_SME_SF8FMA (1UL << 60) +#define HWCAP2_SME_SF8DP4 (1UL << 61) +#define HWCAP2_SME_SF8DP2 (1UL << 62) diff --git a/sysdeps/unix/sysv/linux/aarch64/clone3.S b/sysdeps/unix/sysv/linux/aarch64/clone3.S index 92d69a5430..e28aaa5083 100644 --- a/sysdeps/unix/sysv/linux/aarch64/clone3.S +++ b/sysdeps/unix/sysv/linux/aarch64/clone3.S @@ -82,4 +82,3 @@ thread_start: .size thread_start, .-thread_start libc_hidden_def (__clone3) -weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/aarch64/libm.abilist b/sysdeps/unix/sysv/linux/aarch64/libm.abilist index f0da228fbb..a4bb539670 100644 --- a/sysdeps/unix/sysv/linux/aarch64/libm.abilist +++ b/sysdeps/unix/sysv/linux/aarch64/libm.abilist @@ -1149,3 +1149,11 @@ GLIBC_2.35 hypotf F GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist index 26c3fbf18b..b685106954 100644 --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist @@ -79,6 +79,8 @@ GLIBC_2.40 _ZGVnN2v_asinh F GLIBC_2.40 _ZGVnN2v_asinhf F GLIBC_2.40 _ZGVnN2v_atanh F GLIBC_2.40 _ZGVnN2v_atanhf F +GLIBC_2.40 _ZGVnN2v_cbrt F +GLIBC_2.40 _ZGVnN2v_cbrtf F GLIBC_2.40 _ZGVnN2v_cosh F GLIBC_2.40 _ZGVnN2v_coshf F GLIBC_2.40 _ZGVnN2v_erf F @@ -89,20 +91,29 @@ GLIBC_2.40 _ZGVnN2v_sinh F GLIBC_2.40 _ZGVnN2v_sinhf F GLIBC_2.40 _ZGVnN2v_tanh F GLIBC_2.40 
_ZGVnN2v_tanhf F +GLIBC_2.40 _ZGVnN2vv_hypot F +GLIBC_2.40 _ZGVnN2vv_hypotf F +GLIBC_2.40 _ZGVnN2vv_pow F +GLIBC_2.40 _ZGVnN2vv_powf F GLIBC_2.40 _ZGVnN4v_acoshf F GLIBC_2.40 _ZGVnN4v_asinhf F GLIBC_2.40 _ZGVnN4v_atanhf F +GLIBC_2.40 _ZGVnN4v_cbrtf F GLIBC_2.40 _ZGVnN4v_coshf F GLIBC_2.40 _ZGVnN4v_erfcf F GLIBC_2.40 _ZGVnN4v_erff F GLIBC_2.40 _ZGVnN4v_sinhf F GLIBC_2.40 _ZGVnN4v_tanhf F +GLIBC_2.40 _ZGVnN4vv_hypotf F +GLIBC_2.40 _ZGVnN4vv_powf F GLIBC_2.40 _ZGVsMxv_acosh F GLIBC_2.40 _ZGVsMxv_acoshf F GLIBC_2.40 _ZGVsMxv_asinh F GLIBC_2.40 _ZGVsMxv_asinhf F GLIBC_2.40 _ZGVsMxv_atanh F GLIBC_2.40 _ZGVsMxv_atanhf F +GLIBC_2.40 _ZGVsMxv_cbrt F +GLIBC_2.40 _ZGVsMxv_cbrtf F GLIBC_2.40 _ZGVsMxv_cosh F GLIBC_2.40 _ZGVsMxv_coshf F GLIBC_2.40 _ZGVsMxv_erf F @@ -113,3 +124,7 @@ GLIBC_2.40 _ZGVsMxv_sinh F GLIBC_2.40 _ZGVsMxv_sinhf F GLIBC_2.40 _ZGVsMxv_tanh F GLIBC_2.40 _ZGVsMxv_tanhf F +GLIBC_2.40 _ZGVsMxvv_hypot F +GLIBC_2.40 _ZGVsMxvv_hypotf F +GLIBC_2.40 _ZGVsMxvv_pow F +GLIBC_2.40 _ZGVsMxvv_powf F diff --git a/sysdeps/unix/sysv/linux/alpha/libm.abilist b/sysdeps/unix/sysv/linux/alpha/libm.abilist index f5d8023d62..8ff65e3f7e 100644 --- a/sysdeps/unix/sysv/linux/alpha/libm.abilist +++ b/sysdeps/unix/sysv/linux/alpha/libm.abilist @@ -1308,3 +1308,11 @@ GLIBC_2.4 truncl F GLIBC_2.4 y0l F GLIBC_2.4 y1l F GLIBC_2.4 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/arc/clone3.S b/sysdeps/unix/sysv/linux/arc/clone3.S index d9a2dfff3e..38a27063fb 100644 --- a/sysdeps/unix/sysv/linux/arc/clone3.S +++ b/sysdeps/unix/sysv/linux/arc/clone3.S @@ -87,4 +87,3 @@ thread_start_clone3: .size thread_start_clone3, .-thread_start_clone3 libc_hidden_def (__clone3) -weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/arc/libm.abilist b/sysdeps/unix/sysv/linux/arc/libm.abilist index 
f2169f8f57..af9df2673c 100644 --- a/sysdeps/unix/sysv/linux/arc/libm.abilist +++ b/sysdeps/unix/sysv/linux/arc/libm.abilist @@ -757,3 +757,9 @@ GLIBC_2.35 fminimumf64 F GLIBC_2.35 fminimuml F GLIBC_2.35 fsqrt F GLIBC_2.35 fsqrtl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/arm/be/libm.abilist b/sysdeps/unix/sysv/linux/arm/be/libm.abilist index f020a8a904..0e460259f3 100644 --- a/sysdeps/unix/sysv/linux/arm/be/libm.abilist +++ b/sysdeps/unix/sysv/linux/arm/be/libm.abilist @@ -848,3 +848,9 @@ GLIBC_2.4 y1l F GLIBC_2.4 yn F GLIBC_2.4 ynf F GLIBC_2.4 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/arm/clone3.S b/sysdeps/unix/sysv/linux/arm/clone3.S index e759de9730..094efc2f40 100644 --- a/sysdeps/unix/sysv/linux/arm/clone3.S +++ b/sysdeps/unix/sysv/linux/arm/clone3.S @@ -77,4 +77,3 @@ PSEUDO_END (__clone3) .fnend libc_hidden_def (__clone3) -weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/arm/le/libm.abilist b/sysdeps/unix/sysv/linux/arm/le/libm.abilist index f020a8a904..0e460259f3 100644 --- a/sysdeps/unix/sysv/linux/arm/le/libm.abilist +++ b/sysdeps/unix/sysv/linux/arm/le/libm.abilist @@ -848,3 +848,9 @@ GLIBC_2.4 y1l F GLIBC_2.4 yn F GLIBC_2.4 ynf F GLIBC_2.4 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/bits/socket.h b/sysdeps/unix/sysv/linux/bits/socket.h index 86444f0018..5ab19a8c08 100644 --- a/sysdeps/unix/sysv/linux/bits/socket.h +++ b/sysdeps/unix/sysv/linux/bits/socket.h @@ -181,7 +181,7 @@ typedef __socklen_t socklen_t; #include <bits/sockaddr.h> /* Structure describing a generic socket address. 
*/ -struct sockaddr +struct __attribute_struct_may_alias__ sockaddr { __SOCKADDR_COMMON (sa_); /* Common data: address family and length. */ char sa_data[14]; /* Address data. */ @@ -194,7 +194,7 @@ struct sockaddr #define _SS_PADSIZE \ (_SS_SIZE - __SOCKADDR_COMMON_SIZE - sizeof (__ss_aligntype)) -struct sockaddr_storage +struct __attribute_struct_may_alias__ sockaddr_storage { __SOCKADDR_COMMON (ss_); /* Address family, etc. */ char __ss_padding[_SS_PADSIZE]; diff --git a/sysdeps/unix/sysv/linux/csky/libm.abilist b/sysdeps/unix/sysv/linux/csky/libm.abilist index 225a5e9b81..4c1216044d 100644 --- a/sysdeps/unix/sysv/linux/csky/libm.abilist +++ b/sysdeps/unix/sysv/linux/csky/libm.abilist @@ -823,3 +823,9 @@ GLIBC_2.35 fminimumf64 F GLIBC_2.35 fminimuml F GLIBC_2.35 fsqrt F GLIBC_2.35 fsqrtl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/dl-execstack.c b/sysdeps/unix/sysv/linux/dl-execstack.c index 3d8f3938da..b986898598 100644 --- a/sysdeps/unix/sysv/linux/dl-execstack.c +++ b/sysdeps/unix/sysv/linux/dl-execstack.c @@ -27,35 +27,30 @@ #include <sysdep.h> #include <unistd.h> -extern int __stack_prot attribute_relro attribute_hidden; - static int make_main_stack_executable (void **stack_endp) { /* This gives us the highest/lowest page that needs to be changed. */ uintptr_t page = ((uintptr_t) *stack_endp & -(intptr_t) GLRO(dl_pagesize)); - int result = 0; - if (__builtin_expect (__mprotect ((void *) page, GLRO(dl_pagesize), - __stack_prot) == 0, 1)) - goto return_success; - result = errno; - goto out; + if (__mprotect ((void *) page, GLRO(dl_pagesize), + PROT_READ | PROT_WRITE | PROT_EXEC +#if _STACK_GROWS_DOWN + | PROT_GROWSDOWN +#elif _STACK_GROWS_UP + | PROT_GROWSUP +#endif + ) != 0) + return errno; - return_success: /* Clear the address. */ *stack_endp = NULL; /* Remember that we changed the permission. 
*/ GL(dl_stack_flags) |= PF_X; - out: -#ifdef check_consistency - check_consistency (); -#endif - - return result; + return 0; } int diff --git a/sysdeps/unix/sysv/linux/hppa/libm.abilist b/sysdeps/unix/sysv/linux/hppa/libm.abilist index 450ac03223..3ea5b611eb 100644 --- a/sysdeps/unix/sysv/linux/hppa/libm.abilist +++ b/sysdeps/unix/sysv/linux/hppa/libm.abilist @@ -848,3 +848,9 @@ GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F GLIBC_2.4 exp2l F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/i386/clone3.S b/sysdeps/unix/sysv/linux/i386/clone3.S index abb7356b1f..6803e59644 100644 --- a/sysdeps/unix/sysv/linux/i386/clone3.S +++ b/sysdeps/unix/sysv/linux/i386/clone3.S @@ -120,4 +120,3 @@ L(thread_start): PSEUDO_END (__clone3) libc_hidden_def (__clone3) -weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/i386/libm.abilist b/sysdeps/unix/sysv/linux/i386/libm.abilist index 5d89aaa08e..696e52e65a 100644 --- a/sysdeps/unix/sysv/linux/i386/libm.abilist +++ b/sysdeps/unix/sysv/linux/i386/libm.abilist @@ -1188,3 +1188,11 @@ GLIBC_2.35 fsqrt F GLIBC_2.35 fsqrtl F GLIBC_2.35 hypot F GLIBC_2.35 hypotf F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/loongarch/clone3.S b/sysdeps/unix/sysv/linux/loongarch/clone3.S index 20eaf4adc2..dfffb1f3d6 100644 --- a/sysdeps/unix/sysv/linux/loongarch/clone3.S +++ b/sysdeps/unix/sysv/linux/loongarch/clone3.S @@ -80,4 +80,3 @@ L (thread_start3): END (__thread_start3) libc_hidden_def (__clone3) -weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist b/sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist index b3dbd00001..1c6f02ac5a 100644 --- 
a/sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist +++ b/sysdeps/unix/sysv/linux/loongarch/lp64/libm.abilist @@ -1028,3 +1028,11 @@ GLIBC_2.36 ynf32x F GLIBC_2.36 ynf64 F GLIBC_2.36 ynf64x F GLIBC_2.36 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/m68k/coldfire/libm.abilist b/sysdeps/unix/sysv/linux/m68k/coldfire/libm.abilist index f020a8a904..0e460259f3 100644 --- a/sysdeps/unix/sysv/linux/m68k/coldfire/libm.abilist +++ b/sysdeps/unix/sysv/linux/m68k/coldfire/libm.abilist @@ -848,3 +848,9 @@ GLIBC_2.4 y1l F GLIBC_2.4 yn F GLIBC_2.4 ynf F GLIBC_2.4 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/m68k/m680x0/libm.abilist b/sysdeps/unix/sysv/linux/m68k/m680x0/libm.abilist index e7cd739a54..be8262bb48 100644 --- a/sysdeps/unix/sysv/linux/m68k/m680x0/libm.abilist +++ b/sysdeps/unix/sysv/linux/m68k/m680x0/libm.abilist @@ -884,3 +884,9 @@ GLIBC_2.35 fsqrt F GLIBC_2.35 fsqrtl F GLIBC_2.35 hypot F GLIBC_2.35 hypotf F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/microblaze/be/libm.abilist b/sysdeps/unix/sysv/linux/microblaze/be/libm.abilist index 1f7f63f60c..cc091b2d4e 100644 --- a/sysdeps/unix/sysv/linux/microblaze/be/libm.abilist +++ b/sysdeps/unix/sysv/linux/microblaze/be/libm.abilist @@ -848,3 +848,9 @@ GLIBC_2.35 hypotf F GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/microblaze/le/libm.abilist b/sysdeps/unix/sysv/linux/microblaze/le/libm.abilist 
index 1f7f63f60c..cc091b2d4e 100644 --- a/sysdeps/unix/sysv/linux/microblaze/le/libm.abilist +++ b/sysdeps/unix/sysv/linux/microblaze/le/libm.abilist @@ -848,3 +848,9 @@ GLIBC_2.35 hypotf F GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/microblaze/syscalls.list b/sysdeps/unix/sysv/linux/microblaze/syscalls.list index 878ce7e6a2..5760c657a6 100644 --- a/sysdeps/unix/sysv/linux/microblaze/syscalls.list +++ b/sysdeps/unix/sysv/linux/microblaze/syscalls.list @@ -1,5 +1,3 @@ # File name Caller Syscall name Args Strong name Weak names -cacheflush EXTRA cacheflush i:iiii __cacheflush cacheflush - personality EXTRA personality Ei:i __personality personality diff --git a/sysdeps/unix/sysv/linux/mips/clone3.S b/sysdeps/unix/sysv/linux/mips/clone3.S index 481b8ae963..58925d6b41 100644 --- a/sysdeps/unix/sysv/linux/mips/clone3.S +++ b/sysdeps/unix/sysv/linux/mips/clone3.S @@ -158,4 +158,3 @@ L(thread_start_clone3): END(__thread_start_clone3) libc_hidden_def (__clone3) -weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/mips/mips32/libm.abilist b/sysdeps/unix/sysv/linux/mips/mips32/libm.abilist index 797071aee8..4d442a989d 100644 --- a/sysdeps/unix/sysv/linux/mips/mips32/libm.abilist +++ b/sysdeps/unix/sysv/linux/mips/mips32/libm.abilist @@ -848,3 +848,9 @@ GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F GLIBC_2.4 exp2l F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/mips/mips64/libm.abilist b/sysdeps/unix/sysv/linux/mips/mips64/libm.abilist index 14758703cf..ae7b4a7558 100644 --- a/sysdeps/unix/sysv/linux/mips/mips64/libm.abilist +++ b/sysdeps/unix/sysv/linux/mips/mips64/libm.abilist @@ -1149,3 +1149,11 @@ GLIBC_2.35 hypotf F GLIBC_2.38 fmod F 
GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/net/if_packet.h b/sysdeps/unix/sysv/linux/net/if_packet.h index 9ffb69b508..c17e1c23c5 100644 --- a/sysdeps/unix/sysv/linux/net/if_packet.h +++ b/sysdeps/unix/sysv/linux/net/if_packet.h @@ -26,7 +26,7 @@ From Linux 2.1 the AF_PACKET interface is preferred and you should consider using it in place of this one. */ -struct sockaddr_pkt +struct __attribute_struct_may_alias__ sockaddr_pkt { __SOCKADDR_COMMON (spkt_); unsigned char spkt_device[14]; diff --git a/sysdeps/unix/sysv/linux/netash/ash.h b/sysdeps/unix/sysv/linux/netash/ash.h index 7d885d17cc..7a6ff50b17 100644 --- a/sysdeps/unix/sysv/linux/netash/ash.h +++ b/sysdeps/unix/sysv/linux/netash/ash.h @@ -22,7 +22,7 @@ #include <features.h> #include <bits/sockaddr.h> -struct sockaddr_ash +struct __attribute_struct_may_alias__ sockaddr_ash { __SOCKADDR_COMMON (sash_); /* Common data: address family etc. */ int sash_ifindex; /* Interface to use. */ diff --git a/sysdeps/unix/sysv/linux/neteconet/ec.h b/sysdeps/unix/sysv/linux/neteconet/ec.h index b07a107961..f3132f06ff 100644 --- a/sysdeps/unix/sysv/linux/neteconet/ec.h +++ b/sysdeps/unix/sysv/linux/neteconet/ec.h @@ -28,7 +28,7 @@ struct ec_addr unsigned char net; /* Network number. */ }; -struct sockaddr_ec +struct __attribute_struct_may_alias__ sockaddr_ec { __SOCKADDR_COMMON (sec_); unsigned char port; /* Port number. 
*/ diff --git a/sysdeps/unix/sysv/linux/netiucv/iucv.h b/sysdeps/unix/sysv/linux/netiucv/iucv.h index f5fad81751..27151e8bbe 100644 --- a/sysdeps/unix/sysv/linux/netiucv/iucv.h +++ b/sysdeps/unix/sysv/linux/netiucv/iucv.h @@ -23,7 +23,7 @@ __BEGIN_DECLS -struct sockaddr_iucv +struct __attribute_struct_may_alias__ sockaddr_iucv { __SOCKADDR_COMMON (siucv_); unsigned short siucv_port; /* Reserved */ diff --git a/sysdeps/unix/sysv/linux/nios2/libm.abilist b/sysdeps/unix/sysv/linux/nios2/libm.abilist index c0ebe119dc..9ed3fbdee4 100644 --- a/sysdeps/unix/sysv/linux/nios2/libm.abilist +++ b/sysdeps/unix/sysv/linux/nios2/libm.abilist @@ -848,3 +848,9 @@ GLIBC_2.35 hypotf F GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/or1k/Versions b/sysdeps/unix/sysv/linux/or1k/Versions new file mode 100644 index 0000000000..c1299de116 --- /dev/null +++ b/sysdeps/unix/sysv/linux/or1k/Versions @@ -0,0 +1,14 @@ +libc { + GLIBC_2.35 { + getcontext; + setcontext; + swapcontext; + makecontext; + } + GLIBC_2.40 { + getcontext; + setcontext; + swapcontext; + makecontext; + } +} diff --git a/sysdeps/unix/sysv/linux/or1k/getcontext-common.S b/sysdeps/unix/sysv/linux/or1k/getcontext-common.S new file mode 100644 index 0000000000..9187749615 --- /dev/null +++ b/sysdeps/unix/sysv/linux/or1k/getcontext-common.S @@ -0,0 +1,88 @@ +/* Save current context. OpenRISC common version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* This common getcontext template helps define different + implementations using control macros. Before including + this file in another file define the following: + + __CONTEXT_FUNC_NAME + __CONTEXT_ENABLE_FPCSR + __CONTEXT_SIGMASK_OFFSET + */ + +/* int getcontext (ucontext_t *ucp) + + Returns 0 on success -1 and errno on failure. + */ + .text +ENTRY(__CONTEXT_FUNC_NAME) + /* Store r1, the stack pointer. */ + l.sw (UCONTEXT_MCONTEXT + 1*4)(r3), r1 + /* Store r2, the frame pointer. */ + l.sw (UCONTEXT_MCONTEXT + 2*4)(r3), r2 + /* Store r9, the link register. */ + l.sw (UCONTEXT_MCONTEXT + 9*4)(r3), r9 + /* Store r9 to reg[11] too, as we need two links for makecontext. */ + l.sw (UCONTEXT_MCONTEXT + 11*4)(r3), r9 + /* Store r10, the TLS register. */ + l.sw (UCONTEXT_MCONTEXT + 10*4)(r3), r10 + /* Store r14-r30 even, callee saved registers. */ + l.sw (UCONTEXT_MCONTEXT + 14*4)(r3), r14 + l.sw (UCONTEXT_MCONTEXT + 16*4)(r3), r16 + l.sw (UCONTEXT_MCONTEXT + 18*4)(r3), r18 + l.sw (UCONTEXT_MCONTEXT + 20*4)(r3), r20 + l.sw (UCONTEXT_MCONTEXT + 22*4)(r3), r22 + l.sw (UCONTEXT_MCONTEXT + 24*4)(r3), r24 + l.sw (UCONTEXT_MCONTEXT + 26*4)(r3), r26 + l.sw (UCONTEXT_MCONTEXT + 28*4)(r3), r28 + l.sw (UCONTEXT_MCONTEXT + 30*4)(r3), r30 + +#ifdef __CONTEXT_ENABLE_FPCSR +# ifdef __or1k_hard_float__ + /* Store the floating point state. */ + l.mfspr r4, r0, 20 + l.sw (MCONTEXT_FPCSR)(r3), r4 +# else + /* Store zero to indicate default rounding as per softfloat. 
*/ + l.sw (MCONTEXT_FPCSR)(r3), r0 +# endif /* __or1k_hard_float__ */ +#endif /* __CONTEXT_ENABLE_FPCSR */ + + /* Get signal mask. */ + /* rt_sigprocmask (SIG_BLOCK, NULL, &ucp->uc_sigmask, _NSIG8) */ + l.ori r6, r0, _NSIG8 + l.addi r5, r3, __CONTEXT_SIGMASK_OFFSET + l.ori r4, r0, 0 + l.ori r3, r0, SIG_BLOCK + l.ori r11, r0, SYS_ify (rt_sigprocmask) + /* Do the syscall. */ + l.sys 1 + l.nop + + /* if -4096 < ret < 0 holds, it's an error */ + l.sfgeui r11, 0xf001 + l.bf 1f + l.nop + + l.jr r9 + l.ori r11, r0, 0 + +1: l.j __syscall_error + l.ori r3, r11, 0 + +END(__CONTEXT_FUNC_NAME) diff --git a/sysdeps/unix/sysv/linux/or1k/getcontext.S b/sysdeps/unix/sysv/linux/or1k/getcontext.S index a25b377bda..da69e6999c 100644 --- a/sysdeps/unix/sysv/linux/or1k/getcontext.S +++ b/sysdeps/unix/sysv/linux/or1k/getcontext.S @@ -17,56 +17,35 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <shlib-compat.h> #include "ucontext_i.h" -/* int getcontext (ucontext_t *ucp) +#define __CONTEXT_FUNC_NAME __getcontext +#define __CONTEXT_ENABLE_FPCSR 1 +#define __CONTEXT_SIGMASK_OFFSET UCONTEXT_SIGMASK - Returns 0 on success -1 and errno on failure. - */ - .text -ENTRY(__getcontext) - /* Store r1, the stack pointer. */ - l.sw (UCONTEXT_MCONTEXT + 1*4)(r3), r1 - /* Store r2, the frame pointer. */ - l.sw (UCONTEXT_MCONTEXT + 2*4)(r3), r2 - /* Store r9, the link register. */ - l.sw (UCONTEXT_MCONTEXT + 9*4)(r3), r9 - /* Store r9 to reg[11] too, as we need two links for makecontext. */ - l.sw (UCONTEXT_MCONTEXT + 11*4)(r3), r9 - /* Store r10, the TLS register. */ - l.sw (UCONTEXT_MCONTEXT + 10*4)(r3), r10 - /* Store r14-r30 even, callee saved registers. 
*/ - l.sw (UCONTEXT_MCONTEXT + 14*4)(r3), r14 - l.sw (UCONTEXT_MCONTEXT + 16*4)(r3), r16 - l.sw (UCONTEXT_MCONTEXT + 18*4)(r3), r18 - l.sw (UCONTEXT_MCONTEXT + 20*4)(r3), r20 - l.sw (UCONTEXT_MCONTEXT + 22*4)(r3), r22 - l.sw (UCONTEXT_MCONTEXT + 24*4)(r3), r24 - l.sw (UCONTEXT_MCONTEXT + 26*4)(r3), r26 - l.sw (UCONTEXT_MCONTEXT + 28*4)(r3), r28 - l.sw (UCONTEXT_MCONTEXT + 30*4)(r3), r30 +#include "getcontext-common.S" - /* Get signal mask. */ - /* rt_sigprocmask (SIG_BLOCK, NULL, &ucp->uc_sigmask, _NSIG8) */ - l.ori r6, r0, _NSIG8 - l.addi r5, r3, UCONTEXT_SIGMASK - l.ori r4, r0, 0 - l.ori r3, r0, SIG_BLOCK - l.ori r11, r0, SYS_ify (rt_sigprocmask) - /* Do the syscall. */ - l.sys 1 - l.nop +versioned_symbol (libc, __getcontext, getcontext, GLIBC_2_40) - /* if -4096 < ret < 0 holds, it's an error */ - l.sfgeui r11, 0xf001 - l.bf 1f - l.nop +#if SHLIB_COMPAT (libc, GLIBC_2_35, GLIBC_2_40) - l.jr r9 - l.ori r11, r0, 0 +/* Define a compat version of getcontext for glibc's before the fpcsr + field was added to mcontext_t. The offset sigmask changed with this + introduction, the change was done because glibc's definition of + ucontext_t was initially defined incompatible with the Linux + definition of ucontext_t. We keep the compatability definition to + allow getcontext, setcontext and swapcontext to work in older + binaries. 
*/ -1: l.j __syscall_error - l.ori r3, r11, 0 +# undef __CONTEXT_FUNC_NAME +# undef __CONTEXT_ENABLE_FPCSR +# undef __CONTEXT_SIGMASK_OFFSET +# define __CONTEXT_FUNC_NAME __getcontext_nofpcsr +# define __CONTEXT_SIGMASK_OFFSET (UCONTEXT_SIGMASK - 4) -END(__getcontext) -weak_alias(__getcontext, getcontext) +# include "getcontext-common.S" + +compat_symbol (libc, __getcontext_nofpcsr, getcontext, GLIBC_2_35) + +#endif diff --git a/sysdeps/unix/sysv/linux/or1k/libc.abilist b/sysdeps/unix/sysv/linux/or1k/libc.abilist index c40c843aaf..959e59e7e7 100644 --- a/sysdeps/unix/sysv/linux/or1k/libc.abilist +++ b/sysdeps/unix/sysv/linux/or1k/libc.abilist @@ -2255,3 +2255,7 @@ GLIBC_2.39 stdc_trailing_zeros_ui F GLIBC_2.39 stdc_trailing_zeros_ul F GLIBC_2.39 stdc_trailing_zeros_ull F GLIBC_2.39 stdc_trailing_zeros_us F +GLIBC_2.40 getcontext F +GLIBC_2.40 makecontext F +GLIBC_2.40 setcontext F +GLIBC_2.40 swapcontext F diff --git a/sysdeps/unix/sysv/linux/or1k/libm.abilist b/sysdeps/unix/sysv/linux/or1k/libm.abilist index 9d26508ff5..69655f7d23 100644 --- a/sysdeps/unix/sysv/linux/or1k/libm.abilist +++ b/sysdeps/unix/sysv/linux/or1k/libm.abilist @@ -757,3 +757,9 @@ GLIBC_2.35 ynf32 F GLIBC_2.35 ynf32x F GLIBC_2.35 ynf64 F GLIBC_2.35 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/or1k/makecontext.c b/sysdeps/unix/sysv/linux/or1k/makecontext.c index fa6626e7de..7e131bae41 100644 --- a/sysdeps/unix/sysv/linux/or1k/makecontext.c +++ b/sysdeps/unix/sysv/linux/or1k/makecontext.c @@ -16,6 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ +#include <shlib-compat.h> #include <sysdep.h> #include <stdarg.h> #include <stdint.h> @@ -36,12 +37,11 @@ r1 : stack pointer r2 : frame pointer, set to NULL */ -void -__makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) 
+static void +do_makecontext (ucontext_t *ucp, void (*startcontext) (void), + void (*func) (void), int argc, va_list ap) { - extern void __startcontext (void); unsigned long int *sp; - va_list ap; int i; sp = (unsigned long int *) @@ -55,8 +55,8 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) /* Keep uc_link in r14. */ ucp->uc_mcontext.__gprs[14] = (uintptr_t) ucp->uc_link; - /* Return address points to function __startcontext. */ - ucp->uc_mcontext.__gprs[9] = (uintptr_t) &__startcontext; + /* Return address points to function startcontext. */ + ucp->uc_mcontext.__gprs[9] = (uintptr_t) startcontext; /* Frame pointer is null. */ ucp->uc_mcontext.__gprs[2] = (uintptr_t) 0; /* Restart in user-space starting at 'func'. */ @@ -64,14 +64,47 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) /* Set stack pointer. */ ucp->uc_mcontext.__gprs[1] = (uintptr_t) sp; - va_start (ap, argc); for (i = 0; i < argc; ++i) if (i < 6) ucp->uc_mcontext.__gprs[i + 3] = va_arg (ap, unsigned long int); else sp[i - 6] = va_arg (ap, unsigned long int); +} +void +__makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) +{ + extern void __startcontext (void); + va_list ap; + + va_start (ap, argc); + do_makecontext (ucp, &__startcontext, func, argc, ap); va_end (ap); } -weak_alias (__makecontext, makecontext) +versioned_symbol (libc, __makecontext, makecontext, GLIBC_2_40); + +#if SHLIB_COMPAT (libc, GLIBC_2_35, GLIBC_2_40) + +/* Define a compat version of makecontext for glibc's before the fpcsr + field was added to mcontext_t. The offset sigmask changed with this + introduction, the change was done because glibc's definition of + ucontext_t was initially defined incompatible with the Linux + definition of ucontext_t. We keep the compatability definition to + allow getcontext, setcontext and swapcontext to work in older + binaries. */ + +void +__makecontext_nofpcsr (ucontext_t *ucp, void (*func) (void), int argc, ...) 
+{ + extern void __startcontext_nofpcsr (void); + va_list ap; + + va_start (ap, argc); + do_makecontext (ucp, &__startcontext_nofpcsr, func, argc, ap); + va_end (ap); +} + +compat_symbol (libc, __makecontext_nofpcsr, makecontext, GLIBC_2_35); + +#endif diff --git a/sysdeps/unix/sysv/linux/or1k/setcontext-common.S b/sysdeps/unix/sysv/linux/or1k/setcontext-common.S new file mode 100644 index 0000000000..8a4f147513 --- /dev/null +++ b/sysdeps/unix/sysv/linux/or1k/setcontext-common.S @@ -0,0 +1,120 @@ +/* Set current context. OpenRISC common version. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* This common setcontext and startcontext template helps define + different implementations using control macros. Before including + this file in another file define the following: + + __CONTEXT_FUNC_NAME + __CONTEXT_ENABLE_FPCSR + __CONTEXT_SIGMASK_OFFSET + __STARTCONTEXT_FUNC_NAME + */ + +/* int setcontext (const ucontext_t *ucp) */ + .text +ENTRY(__CONTEXT_FUNC_NAME) + l.ori r30, r3, 0 + + /* Restore signal mask. */ + /* rt_sigprocmask (SIG_SETMASK, &ucp->uc_sigmask, NULL, _NSIG8) */ + l.ori r6, r0, _NSIG8 + l.ori r5, r0, 0 + l.addi r4, r3, __CONTEXT_SIGMASK_OFFSET + l.ori r3, r0, SIG_SETMASK + l.ori r11, r0, SYS_ify (rt_sigprocmask) + /* Do the syscall. 
*/ + l.sys 1 + l.nop + + /* if -4096 < ret < 0 holds, it's an error */ + l.sfgeui r11, 0xf001 + l.bf 1f + l.nop +#ifdef __CONTEXT_ENABLE_FPCSR +# ifdef __or1k_hard_float__ + /* Restore the floating point state. */ + l.lwz r28, (MCONTEXT_FPCSR)(r30) + l.mtspr r0, r28, 20 +# endif /* __or1k_hard_float__ */ +#endif /* __CONTEXT_ENABLE_FPCSR */ + /* Restore argument registers, for the makecontext case. */ + l.lwz r3, (UCONTEXT_MCONTEXT + 3*4)(r30) + l.lwz r4, (UCONTEXT_MCONTEXT + 4*4)(r30) + l.lwz r5, (UCONTEXT_MCONTEXT + 5*4)(r30) + l.lwz r6, (UCONTEXT_MCONTEXT + 6*4)(r30) + l.lwz r7, (UCONTEXT_MCONTEXT + 7*4)(r30) + l.lwz r8, (UCONTEXT_MCONTEXT + 8*4)(r30) + + /* Restore registers stored in getcontext. */ + l.lwz r1, (UCONTEXT_MCONTEXT + 1*4)(r30) + l.lwz r2, (UCONTEXT_MCONTEXT + 2*4)(r30) + l.lwz r9, (UCONTEXT_MCONTEXT + 9*4)(r30) + l.lwz r10, (UCONTEXT_MCONTEXT + 10*4)(r30) + l.lwz r11, (UCONTEXT_MCONTEXT + 11*4)(r30) + /* Restore r14-r30 even, callee saved registers. */ + l.lwz r14, (UCONTEXT_MCONTEXT + 14*4)(r30) + l.lwz r16, (UCONTEXT_MCONTEXT + 16*4)(r30) + l.lwz r18, (UCONTEXT_MCONTEXT + 18*4)(r30) + l.lwz r20, (UCONTEXT_MCONTEXT + 20*4)(r30) + l.lwz r22, (UCONTEXT_MCONTEXT + 22*4)(r30) + l.lwz r24, (UCONTEXT_MCONTEXT + 24*4)(r30) + l.lwz r26, (UCONTEXT_MCONTEXT + 26*4)(r30) + l.lwz r28, (UCONTEXT_MCONTEXT + 28*4)(r30) + l.lwz r30, (UCONTEXT_MCONTEXT + 30*4)(r30) + + l.jr r11 + l.ori r11, r0, 0 + +1: l.j __syscall_error + l.ori r3, r11, 0 + +END (__CONTEXT_FUNC_NAME) + + /* We add a NOP here because when the unwinder is looking for the + enclosing function of the link register (r9) address FDE lookup will + use '$r9 - 1' finding setcontext which is wrong. This is because in + makecontext we have set r9 to the start of &__startcontext. + + If this NOP did not exist the unwinder would repeatedly find + __setcontext's FDE in an infinite loop. Modifying/deleting the below + __startcontext's FDE has no help on this. 
*/ + l.nop + +ENTRY(__STARTCONTEXT_FUNC_NAME) + + l.ori r3, r14, 0 + l.sfeq r3, r0 + /* If uc_link is not 0 resume there, otherwise exit. */ + l.bnf __CONTEXT_FUNC_NAME + l.nop + +#ifdef SHARED + /* Obtain a pointer to .got in r16 */ + l.jal 0x8 + l.movhi r16, gotpchi(_GLOBAL_OFFSET_TABLE_-4) + l.ori r16, r16, gotpclo(_GLOBAL_OFFSET_TABLE_+0) + l.add r16, r16, r9 + l.lwz r16, got(exit)(r16) + l.jr r16 +#else + l.j exit +#endif + l.nop + +END(__STARTCONTEXT_FUNC_NAME) diff --git a/sysdeps/unix/sysv/linux/or1k/setcontext.S b/sysdeps/unix/sysv/linux/or1k/setcontext.S index d28a0ac0aa..a49a5c51c3 100644 --- a/sysdeps/unix/sysv/linux/or1k/setcontext.S +++ b/sysdeps/unix/sysv/linux/or1k/setcontext.S @@ -16,93 +16,39 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ +#include <shlib-compat.h> #include <sysdep.h> #include "ucontext_i.h" -/* int setcontext (const ucontext_t *ucp) */ - .text -ENTRY(__setcontext) - l.ori r30, r3, 0 +#define __CONTEXT_FUNC_NAME __setcontext +#define __CONTEXT_ENABLE_FPCSR 1 +#define __CONTEXT_SIGMASK_OFFSET UCONTEXT_SIGMASK +#define __STARTCONTEXT_FUNC_NAME __startcontext - /* Restore signal mask. */ - /* rt_sigprocmask (SIG_SETMASK, &ucp->uc_sigmask, NULL, _NSIG8) */ - l.ori r6, r0, _NSIG8 - l.ori r5, r0, 0 - l.addi r4, r3, UCONTEXT_SIGMASK - l.ori r3, r0, SIG_SETMASK - l.ori r11, r0, SYS_ify (rt_sigprocmask) - /* Do the syscall. */ - l.sys 1 - l.nop +#include "setcontext-common.S" - /* if -4096 < ret < 0 holds, it's an error */ - l.sfgeui r11, 0xf001 - l.bf 1f - l.nop +versioned_symbol (libc, __setcontext, setcontext, GLIBC_2_40) - /* Restore argument registers, for the makecontext case. 
*/ - l.lwz r3, (UCONTEXT_MCONTEXT + 3*4)(r30) - l.lwz r4, (UCONTEXT_MCONTEXT + 4*4)(r30) - l.lwz r5, (UCONTEXT_MCONTEXT + 5*4)(r30) - l.lwz r6, (UCONTEXT_MCONTEXT + 6*4)(r30) - l.lwz r7, (UCONTEXT_MCONTEXT + 7*4)(r30) - l.lwz r8, (UCONTEXT_MCONTEXT + 8*4)(r30) +#if SHLIB_COMPAT (libc, GLIBC_2_35, GLIBC_2_40) - /* Restore registers stored in getcontext. */ - l.lwz r1, (UCONTEXT_MCONTEXT + 1*4)(r30) - l.lwz r2, (UCONTEXT_MCONTEXT + 2*4)(r30) - l.lwz r9, (UCONTEXT_MCONTEXT + 9*4)(r30) - l.lwz r10, (UCONTEXT_MCONTEXT + 10*4)(r30) - l.lwz r11, (UCONTEXT_MCONTEXT + 11*4)(r30) - /* Restore r14-r30 even, callee saved registers. */ - l.lwz r14, (UCONTEXT_MCONTEXT + 14*4)(r30) - l.lwz r16, (UCONTEXT_MCONTEXT + 16*4)(r30) - l.lwz r18, (UCONTEXT_MCONTEXT + 18*4)(r30) - l.lwz r20, (UCONTEXT_MCONTEXT + 20*4)(r30) - l.lwz r22, (UCONTEXT_MCONTEXT + 22*4)(r30) - l.lwz r24, (UCONTEXT_MCONTEXT + 24*4)(r30) - l.lwz r26, (UCONTEXT_MCONTEXT + 26*4)(r30) - l.lwz r28, (UCONTEXT_MCONTEXT + 28*4)(r30) - l.lwz r30, (UCONTEXT_MCONTEXT + 30*4)(r30) +/* Define a compat version of setcontext for glibc's before the fpcsr + field was added to mcontext_t. The offset sigmask changed with this + introduction, the change was done because glibc's definition of + ucontext_t was initially defined incompatible with the Linux + definition of ucontext_t. We keep the compatability definition to + allow getcontext, setcontext and swapcontext to work in older + binaries. 
*/ - l.jr r11 - l.ori r11, r0, 0 +# undef __CONTEXT_FUNC_NAME +# undef __CONTEXT_ENABLE_FPCSR +# undef __CONTEXT_SIGMASK_OFFSET +# undef __STARTCONTEXT_FUNC_NAME +# define __CONTEXT_FUNC_NAME __setcontext_nofpcsr +# define __CONTEXT_SIGMASK_OFFSET (UCONTEXT_SIGMASK - 4) +# define __STARTCONTEXT_FUNC_NAME __startcontext_nofpcsr -1: l.j __syscall_error - l.ori r3, r11, 0 +# include "setcontext-common.S" -END (__setcontext) -weak_alias (__setcontext, setcontext) +compat_symbol (libc, __setcontext_nofpcsr, setcontext, GLIBC_2_35) - /* We add a NOP here because when the unwinder is looking for the - enclosing function of the link register (r9) address FDE lookup will - use '$r9 - 1' finding setcontext which is wrong. This is because in - makecontext we have set r9 to the start of &__startcontext. - - If this NOP did not exist the unwinder would repeatedly find - __setcontext's FDE in an infinite loop. Modifying/deleting the below - __startcontext's FDE has no help on this. */ - l.nop - -ENTRY(__startcontext) - - l.ori r3, r14, 0 - l.sfeq r3, r0 - /* If uc_link is not 0 resume there, otherwise exit. */ - l.bnf __setcontext - l.nop - -#ifdef SHARED - /* Obtain a pointer to .got in r16 */ - l.jal 0x8 - l.movhi r16, gotpchi(_GLOBAL_OFFSET_TABLE_-4) - l.ori r16, r16, gotpclo(_GLOBAL_OFFSET_TABLE_+0) - l.add r16, r16, r9 - l.lwz r16, got(exit)(r16) - l.jr r16 -#else - l.j exit #endif - l.nop - -END(__startcontext) diff --git a/sysdeps/unix/sysv/linux/or1k/swapcontext-common.S b/sysdeps/unix/sysv/linux/or1k/swapcontext-common.S new file mode 100644 index 0000000000..b7e2d4c820 --- /dev/null +++ b/sysdeps/unix/sysv/linux/or1k/swapcontext-common.S @@ -0,0 +1,139 @@ +/* Swap two contexts. OpenRISC version. + Copyright (C) 2022-2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* This common swapcontext template helps define different + implementations using control macros. Before including this file + in another file define the following: + + __CONTEXT_FUNC_NAME + __CONTEXT_ENABLE_FPCSR + __CONTEXT_SIGMASK_OFFSET + */ + +/* int swapcontext (ucontext_t *oucp, const ucontext_t *ucp) */ + .text +ENTRY(__CONTEXT_FUNC_NAME) + + /* Same as getcontext. */ + /* Store r1, the stack pointer. */ + l.sw (UCONTEXT_MCONTEXT + 1*4)(r3), r1 + /* Store r2, the frame pointer. */ + l.sw (UCONTEXT_MCONTEXT + 2*4)(r3), r2 + /* Store r9, the link register. */ + l.sw (UCONTEXT_MCONTEXT + 9*4)(r3), r9 + /* Store r9 to reg[11] too, as we need two links for makecontext. */ + l.sw (UCONTEXT_MCONTEXT + 11*4)(r3), r9 + /* Store r10, the TLS register. */ + l.sw (UCONTEXT_MCONTEXT + 10*4)(r3), r10 + /* Store r14-r30 even, callee saved registers. 
*/ + l.sw (UCONTEXT_MCONTEXT + 14*4)(r3), r14 + l.sw (UCONTEXT_MCONTEXT + 16*4)(r3), r16 + l.sw (UCONTEXT_MCONTEXT + 18*4)(r3), r18 + l.sw (UCONTEXT_MCONTEXT + 20*4)(r3), r20 + l.sw (UCONTEXT_MCONTEXT + 22*4)(r3), r22 + l.sw (UCONTEXT_MCONTEXT + 24*4)(r3), r24 + l.sw (UCONTEXT_MCONTEXT + 26*4)(r3), r26 + l.sw (UCONTEXT_MCONTEXT + 28*4)(r3), r28 + l.sw (UCONTEXT_MCONTEXT + 30*4)(r3), r30 + +#ifdef __CONTEXT_ENABLE_FPCSR +# ifdef __or1k_hard_float__ + /* Store the floating point state. */ + l.mfspr r6, r0, 20 + l.sw (MCONTEXT_FPCSR)(r3), r6 +# else + /* Store zero to indicate default rounding as per softfloat. */ + l.sw (MCONTEXT_FPCSR)(r3), r0 +# endif /* __or1k_hard_float__ */ +#endif /* __CONTEXT_ENABLE_FPCSR */ + /* Store ucp to non-argument syscall preserved register. */ + l.ori r30, r4, 0 + + /* Get signal mask. */ + /* rt_sigprocmask (SIG_BLOCK, NULL, &ucp->uc_sigmask, _NSIG8) */ + l.ori r6, r0, _NSIG8 + l.addi r5, r3, __CONTEXT_SIGMASK_OFFSET + l.ori r4, r0, 0 + l.ori r3, r0, SIG_BLOCK + l.ori r11, r0, SYS_ify (rt_sigprocmask) + /* Do the syscall. */ + l.sys 1 + l.nop + + /* if -4096 < ret < 0 holds, it's an error */ + l.sfgeui r11, 0xf001 + l.bf 1f + l.nop + + /* Same as setcontext. */ + + /* Restore signal mask. */ + /* rt_sigprocmask (SIG_SETMASK, &ucp->uc_sigmask, NULL, _NSIG8) */ + l.ori r6, r0, _NSIG8 + l.ori r5, r0, 0 + l.addi r4, r30, __CONTEXT_SIGMASK_OFFSET + l.ori r3, r0, SIG_SETMASK + l.ori r11, r0, SYS_ify (rt_sigprocmask) + /* Do the syscall. */ + l.sys 1 + l.nop + + /* if -4096 < ret < 0 holds, it's an error */ + l.sfgeui r11, 0xf001 + l.bf 1f + l.nop + +#ifdef __CONTEXT_ENABLE_FPCSR +# ifdef __or1k_hard_float__ + /* Restore the floating point state. */ + l.lwz r28, (MCONTEXT_FPCSR)(r30) + l.mtspr r0, r28, 20 +# endif /* __or1k_hard_float__ */ +#endif /* __CONTEXT_ENABLE_FPCSR */ + + /* Restore argument registers, for the makecontext case. 
*/ + l.lwz r3, (UCONTEXT_MCONTEXT + 3*4)(r30) + l.lwz r4, (UCONTEXT_MCONTEXT + 4*4)(r30) + l.lwz r5, (UCONTEXT_MCONTEXT + 5*4)(r30) + l.lwz r6, (UCONTEXT_MCONTEXT + 6*4)(r30) + l.lwz r7, (UCONTEXT_MCONTEXT + 7*4)(r30) + l.lwz r8, (UCONTEXT_MCONTEXT + 8*4)(r30) + + /* Restore registers stored in getcontext. */ + l.lwz r1, (UCONTEXT_MCONTEXT + 1*4)(r30) + l.lwz r2, (UCONTEXT_MCONTEXT + 2*4)(r30) + l.lwz r9, (UCONTEXT_MCONTEXT + 9*4)(r30) + l.lwz r10, (UCONTEXT_MCONTEXT + 10*4)(r30) + l.lwz r11, (UCONTEXT_MCONTEXT + 11*4)(r30) + l.lwz r14, (UCONTEXT_MCONTEXT + 14*4)(r30) + l.lwz r16, (UCONTEXT_MCONTEXT + 16*4)(r30) + l.lwz r18, (UCONTEXT_MCONTEXT + 18*4)(r30) + l.lwz r20, (UCONTEXT_MCONTEXT + 20*4)(r30) + l.lwz r22, (UCONTEXT_MCONTEXT + 22*4)(r30) + l.lwz r24, (UCONTEXT_MCONTEXT + 24*4)(r30) + l.lwz r26, (UCONTEXT_MCONTEXT + 26*4)(r30) + l.lwz r28, (UCONTEXT_MCONTEXT + 28*4)(r30) + l.lwz r30, (UCONTEXT_MCONTEXT + 30*4)(r30) + + l.jr r11 + l.ori r11, r0, 0 + +1: l.j __syscall_error + l.ori r3, r11, 0 + +END (__CONTEXT_FUNC_NAME) diff --git a/sysdeps/unix/sysv/linux/or1k/swapcontext.S b/sysdeps/unix/sysv/linux/or1k/swapcontext.S index d09651a5b2..861c1e26ba 100644 --- a/sysdeps/unix/sysv/linux/or1k/swapcontext.S +++ b/sysdeps/unix/sysv/linux/or1k/swapcontext.S @@ -16,101 +16,36 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ +#include <shlib-compat.h> #include <sysdep.h> #include "ucontext_i.h" -/* int swapcontext (ucontext_t *oucp, const ucontext_t *ucp) */ - .text -ENTRY(__swapcontext) +#define __CONTEXT_FUNC_NAME __swapcontext +#define __CONTEXT_ENABLE_FPCSR 1 +#define __CONTEXT_SIGMASK_OFFSET UCONTEXT_SIGMASK - /* Same as getcontext. */ - /* Store r1, the stack pointer. */ - l.sw (UCONTEXT_MCONTEXT + 1*4)(r3), r1 - /* Store r2, the frame pointer. */ - l.sw (UCONTEXT_MCONTEXT + 2*4)(r3), r2 - /* Store r9, the link register. 
*/ - l.sw (UCONTEXT_MCONTEXT + 9*4)(r3), r9 - /* Store r9 to reg[11] too, as we need two links for makecontext. */ - l.sw (UCONTEXT_MCONTEXT + 11*4)(r3), r9 - /* Store r10, the TLS register. */ - l.sw (UCONTEXT_MCONTEXT + 10*4)(r3), r10 - /* Store r14-r30 even, callee saved registers. */ - l.sw (UCONTEXT_MCONTEXT + 14*4)(r3), r14 - l.sw (UCONTEXT_MCONTEXT + 16*4)(r3), r16 - l.sw (UCONTEXT_MCONTEXT + 18*4)(r3), r18 - l.sw (UCONTEXT_MCONTEXT + 20*4)(r3), r20 - l.sw (UCONTEXT_MCONTEXT + 22*4)(r3), r22 - l.sw (UCONTEXT_MCONTEXT + 24*4)(r3), r24 - l.sw (UCONTEXT_MCONTEXT + 26*4)(r3), r26 - l.sw (UCONTEXT_MCONTEXT + 28*4)(r3), r28 - l.sw (UCONTEXT_MCONTEXT + 30*4)(r3), r30 +#include "swapcontext-common.S" - /* Store ucp to non-argument syscall preserved register. */ - l.ori r30, r4, 0 +versioned_symbol (libc, __swapcontext, swapcontext, GLIBC_2_40) - /* Get signal mask. */ - /* rt_sigprocmask (SIG_BLOCK, NULL, &ucp->uc_sigmask, _NSIG8) */ - l.ori r6, r0, _NSIG8 - l.addi r5, r3, UCONTEXT_SIGMASK - l.ori r4, r0, 0 - l.ori r3, r0, SIG_BLOCK - l.ori r11, r0, SYS_ify (rt_sigprocmask) - /* Do the syscall. */ - l.sys 1 - l.nop +#if SHLIB_COMPAT (libc, GLIBC_2_35, GLIBC_2_40) - /* if -4096 < ret < 0 holds, it's an error */ - l.sfgeui r11, 0xf001 - l.bf 1f - l.nop +/* Define a compat version of swapcontext for glibc's before the fpcsr + field was added to mcontext_t. The offset sigmask changed with this + introduction, the change was done because glibc's definition of + ucontext_t was initially defined incompatible with the Linux + definition of ucontext_t. We keep the compatability definition to + allow getcontext, setcontext and swapcontext to work in older + binaries. */ - /* Same as setcontext. */ +# undef __CONTEXT_FUNC_NAME +# undef __CONTEXT_ENABLE_FPCSR +# undef __CONTEXT_SIGMASK_OFFSET +# define __CONTEXT_FUNC_NAME __swapcontext_nofpcsr +# define __CONTEXT_SIGMASK_OFFSET (UCONTEXT_SIGMASK - 4) - /* Restore signal mask. 
*/ - /* rt_sigprocmask (SIG_SETMASK, &ucp->uc_sigmask, NULL, _NSIG8) */ - l.ori r6, r0, _NSIG8 - l.ori r5, r0, 0 - l.addi r4, r30, UCONTEXT_SIGMASK - l.ori r3, r0, SIG_SETMASK - l.ori r11, r0, SYS_ify (rt_sigprocmask) - /* Do the syscall. */ - l.sys 1 - l.nop +# include "swapcontext-common.S" - /* if -4096 < ret < 0 holds, it's an error */ - l.sfgeui r11, 0xf001 - l.bf 1f - l.nop +compat_symbol (libc, __swapcontext_nofpcsr, swapcontext, GLIBC_2_35) - /* Restore argument registers, for the makecontext case. */ - l.lwz r3, (UCONTEXT_MCONTEXT + 3*4)(r30) - l.lwz r4, (UCONTEXT_MCONTEXT + 4*4)(r30) - l.lwz r5, (UCONTEXT_MCONTEXT + 5*4)(r30) - l.lwz r6, (UCONTEXT_MCONTEXT + 6*4)(r30) - l.lwz r7, (UCONTEXT_MCONTEXT + 7*4)(r30) - l.lwz r8, (UCONTEXT_MCONTEXT + 8*4)(r30) - - /* Restore registers stored in getcontext. */ - l.lwz r1, (UCONTEXT_MCONTEXT + 1*4)(r30) - l.lwz r2, (UCONTEXT_MCONTEXT + 2*4)(r30) - l.lwz r9, (UCONTEXT_MCONTEXT + 9*4)(r30) - l.lwz r10, (UCONTEXT_MCONTEXT + 10*4)(r30) - l.lwz r11, (UCONTEXT_MCONTEXT + 11*4)(r30) - l.lwz r14, (UCONTEXT_MCONTEXT + 14*4)(r30) - l.lwz r16, (UCONTEXT_MCONTEXT + 16*4)(r30) - l.lwz r18, (UCONTEXT_MCONTEXT + 18*4)(r30) - l.lwz r20, (UCONTEXT_MCONTEXT + 20*4)(r30) - l.lwz r22, (UCONTEXT_MCONTEXT + 22*4)(r30) - l.lwz r24, (UCONTEXT_MCONTEXT + 24*4)(r30) - l.lwz r26, (UCONTEXT_MCONTEXT + 26*4)(r30) - l.lwz r28, (UCONTEXT_MCONTEXT + 28*4)(r30) - l.lwz r30, (UCONTEXT_MCONTEXT + 30*4)(r30) - - l.jr r11 - l.ori r11, r0, 0 - -1: l.j __syscall_error - l.ori r3, r11, 0 - -END (__swapcontext) -weak_alias (__swapcontext, swapcontext) +#endif diff --git a/sysdeps/unix/sysv/linux/or1k/sys/ucontext.h b/sysdeps/unix/sysv/linux/or1k/sys/ucontext.h index b17e919154..1b428592ee 100644 --- a/sysdeps/unix/sysv/linux/or1k/sys/ucontext.h +++ b/sysdeps/unix/sysv/linux/or1k/sys/ucontext.h @@ -38,6 +38,7 @@ typedef struct unsigned long int __gprs[__NGREG]; unsigned long int __pc; unsigned long int __sr; + unsigned long int __fpcsr; } mcontext_t; /* 
Userlevel context. */ diff --git a/sysdeps/unix/sysv/linux/or1k/ucontext_i.sym b/sysdeps/unix/sysv/linux/or1k/ucontext_i.sym index a8d4db080f..45cd72527d 100644 --- a/sysdeps/unix/sysv/linux/or1k/ucontext_i.sym +++ b/sysdeps/unix/sysv/linux/or1k/ucontext_i.sym @@ -13,6 +13,7 @@ _NSIG8 (_NSIG / 8) -- Offsets of the fields in the ucontext_t structure. #define ucontext(member) offsetof (ucontext_t, member) #define stack(member) ucontext (uc_stack.member) +#define mcontext(member) ucontext (uc_mcontext.member) UCONTEXT_LINK ucontext (uc_link) UCONTEXT_STACK ucontext (uc_stack) @@ -23,4 +24,6 @@ STACK_SP stack (ss_sp) STACK_SIZE stack (ss_size) STACK_FLAGS stack (ss_flags) +MCONTEXT_FPCSR mcontext (__fpcsr) + UCONTEXT_SIZE sizeof (ucontext_t) diff --git a/sysdeps/unix/sysv/linux/pidfd_getpid.c b/sysdeps/unix/sysv/linux/pidfd_getpid.c index 8567b413dd..6967477fab 100644 --- a/sysdeps/unix/sysv/linux/pidfd_getpid.c +++ b/sysdeps/unix/sysv/linux/pidfd_getpid.c @@ -74,8 +74,10 @@ parse_fdinfo (const char *l, void *arg) /* Ignore invalid large values. */ if (INT_MULTIPLY_WRAPV (10, n, &n) - || INT_ADD_WRAPV (n, *l++ - '0', &n)) + || INT_ADD_WRAPV (n, *l - '0', &n)) return -1; + + l++; } /* -1 indicates that the process is terminated. */ @@ -105,7 +107,7 @@ pidfd_getpid (int fd) *_fitoa_word (fd, p, 10, 0) = '\0'; struct parse_fdinfo_t fdinfo = { .found = false, .pid = -1 }; - if (!procutils_read_file (fdinfoname, parse_fdinfo, &fdinfo)) + if (!__libc_procutils_read_file (fdinfoname, parse_fdinfo, &fdinfo)) /* The fdinfo contains an invalid 'Pid:' value. 
*/ return INLINE_SYSCALL_ERROR_RETURN_VALUE (EBADF); diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libm.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libm.abilist index 4f88e0af9c..564eb87d4b 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libm.abilist +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libm.abilist @@ -995,3 +995,9 @@ GLIBC_2.4 truncl F GLIBC_2.4 y0l F GLIBC_2.4 y1l F GLIBC_2.4 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libm.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libm.abilist index edc26140dc..a57eedb779 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libm.abilist +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libm.abilist @@ -994,3 +994,9 @@ GLIBC_2.4 truncl F GLIBC_2.4 y0l F GLIBC_2.4 y1l F GLIBC_2.4 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libm.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libm.abilist index 0a8a1433d7..7564069a37 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libm.abilist +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libm.abilist @@ -988,3 +988,9 @@ GLIBC_2.4 truncl F GLIBC_2.4 y0l F GLIBC_2.4 y1l F GLIBC_2.4 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone3.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone3.S index 6fdb4a6073..900c354c9c 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone3.S +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone3.S @@ -154,4 +154,3 @@ L(parent): PSEUDO_END (__clone3) libc_hidden_def (__clone3) -weak_alias (__clone3, 
clone3) diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libm.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libm.abilist index 5174d20032..b02a7115af 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libm.abilist +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libm.abilist @@ -1321,3 +1321,12 @@ GLIBC_2.35 hypotf F GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F +GLIBC_2.40 __log2p1ieee128 F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/procutils.c b/sysdeps/unix/sysv/linux/procutils.c index 25666ec0cb..86d3d37329 100644 --- a/sysdeps/unix/sysv/linux/procutils.c +++ b/sysdeps/unix/sysv/linux/procutils.c @@ -71,8 +71,9 @@ next_line (char **r, int fd, char *const buffer, char **cp, char **re, } bool -procutils_read_file (const char *filename, procutils_closure_t closure, - void *arg) +__libc_procutils_read_file (const char *filename, + procutils_closure_t closure, + void *arg) { enum { buffer_size = PROCUTILS_MAX_LINE_LEN }; char buffer[buffer_size]; diff --git a/sysdeps/unix/sysv/linux/procutils.h b/sysdeps/unix/sysv/linux/procutils.h index 73ef3e5f5b..acf1ec587a 100644 --- a/sysdeps/unix/sysv/linux/procutils.h +++ b/sysdeps/unix/sysv/linux/procutils.h @@ -37,7 +37,8 @@ typedef int (*procutils_closure_t) (const char *line, void *arg); It returns true in case the file is fully read or false if CLOSURE returns a value diferent than 0. 
*/ -bool procutils_read_file (const char *filename, procutils_closure_t closure, - void *arg) attribute_hidden; +bool __libc_procutils_read_file (const char *filename, + procutils_closure_t closure, + void *arg) attribute_hidden; #endif diff --git a/sysdeps/unix/sysv/linux/riscv/clone3.S b/sysdeps/unix/sysv/linux/riscv/clone3.S index 29264be054..c81ee2ab51 100644 --- a/sysdeps/unix/sysv/linux/riscv/clone3.S +++ b/sysdeps/unix/sysv/linux/riscv/clone3.S @@ -76,4 +76,3 @@ L(thread_start): END(__thread_start_clone3) libc_hidden_def (__clone3) -weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/riscv/rv32/libm.abilist b/sysdeps/unix/sysv/linux/riscv/rv32/libm.abilist index 45ca5c677a..7bf0bd7c7a 100644 --- a/sysdeps/unix/sysv/linux/riscv/rv32/libm.abilist +++ b/sysdeps/unix/sysv/linux/riscv/rv32/libm.abilist @@ -1028,3 +1028,11 @@ GLIBC_2.35 fminimumf64x F GLIBC_2.35 fminimuml F GLIBC_2.35 fsqrt F GLIBC_2.35 fsqrtl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/libm.abilist b/sysdeps/unix/sysv/linux/riscv/rv64/libm.abilist index 4838edf991..c22d9bb7c6 100644 --- a/sysdeps/unix/sysv/linux/riscv/rv64/libm.abilist +++ b/sysdeps/unix/sysv/linux/riscv/rv64/libm.abilist @@ -1125,3 +1125,11 @@ GLIBC_2.35 fminimumf64x F GLIBC_2.35 fminimuml F GLIBC_2.35 fsqrt F GLIBC_2.35 fsqrtl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/Makefile b/sysdeps/unix/sysv/linux/s390/s390-32/Makefile index 6b6d59b7dd..2a5b4fbb6f 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-32/Makefile +++ b/sysdeps/unix/sysv/linux/s390/s390-32/Makefile @@ -3,6 +3,7 @@ default-abi := 32 ifeq ($(subdir),login) 
sysdep_routines += utmp32 utmpx32 login32 +shared-only-routines += utmp32 utmpx32 login32 endif ifeq ($(subdir),misc) diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/clone3.S b/sysdeps/unix/sysv/linux/s390/s390-32/clone3.S index d344c2cf36..1ea633aabb 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-32/clone3.S +++ b/sysdeps/unix/sysv/linux/s390/s390-32/clone3.S @@ -75,4 +75,3 @@ thread_start: ASM_SIZE_DIRECTIVE (thread_start) libc_hidden_def (__clone3) -weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/libm.abilist b/sysdeps/unix/sysv/linux/s390/s390-32/libm.abilist index 5ff11fb54f..2fb712ec71 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-32/libm.abilist +++ b/sysdeps/unix/sysv/linux/s390/s390-32/libm.abilist @@ -1252,3 +1252,11 @@ GLIBC_2.4 truncl F GLIBC_2.4 y0l F GLIBC_2.4 y1l F GLIBC_2.4 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/clone3.S b/sysdeps/unix/sysv/linux/s390/s390-64/clone3.S index ca382d903c..70ae7f1532 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-64/clone3.S +++ b/sysdeps/unix/sysv/linux/s390/s390-64/clone3.S @@ -75,4 +75,3 @@ thread_start: ASM_SIZE_DIRECTIVE (thread_start) libc_hidden_def (__clone3) -weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/libm.abilist b/sysdeps/unix/sysv/linux/s390/s390-64/libm.abilist index 0e246c2c54..40489589a7 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-64/libm.abilist +++ b/sysdeps/unix/sysv/linux/s390/s390-64/libm.abilist @@ -1252,3 +1252,11 @@ GLIBC_2.4 truncl F GLIBC_2.4 y0l F GLIBC_2.4 y1l F GLIBC_2.4 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/sh/be/libm.abilist 
b/sysdeps/unix/sysv/linux/sh/be/libm.abilist index 7b43a866e2..a5363e90d6 100644 --- a/sysdeps/unix/sysv/linux/sh/be/libm.abilist +++ b/sysdeps/unix/sysv/linux/sh/be/libm.abilist @@ -848,3 +848,9 @@ GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F GLIBC_2.4 exp2l F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/sh/le/libm.abilist b/sysdeps/unix/sysv/linux/sh/le/libm.abilist index 7b43a866e2..a5363e90d6 100644 --- a/sysdeps/unix/sysv/linux/sh/le/libm.abilist +++ b/sysdeps/unix/sysv/linux/sh/le/libm.abilist @@ -848,3 +848,9 @@ GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F GLIBC_2.4 exp2l F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/libm.abilist b/sysdeps/unix/sysv/linux/sparc/sparc32/libm.abilist index e3dcf3d4e7..9bda9bdeb5 100644 --- a/sysdeps/unix/sysv/linux/sparc/sparc32/libm.abilist +++ b/sysdeps/unix/sysv/linux/sparc/sparc32/libm.abilist @@ -1259,3 +1259,11 @@ GLIBC_2.4 truncl F GLIBC_2.4 y0l F GLIBC_2.4 y1l F GLIBC_2.4 ynl F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/libm.abilist b/sysdeps/unix/sysv/linux/sparc/sparc64/libm.abilist index 20fef20c8b..61d2aa05a9 100644 --- a/sysdeps/unix/sysv/linux/sparc/sparc64/libm.abilist +++ b/sysdeps/unix/sysv/linux/sparc/sparc64/libm.abilist @@ -1149,3 +1149,11 @@ GLIBC_2.35 hypotf F GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git 
a/sysdeps/unix/sysv/linux/sys/epoll.h b/sysdeps/unix/sysv/linux/sys/epoll.h index fc8dce45c8..45e546fa44 100644 --- a/sysdeps/unix/sysv/linux/sys/epoll.h +++ b/sysdeps/unix/sysv/linux/sys/epoll.h @@ -19,6 +19,7 @@ #define _SYS_EPOLL_H 1 #include <stdint.h> +#include <sys/ioctl.h> #include <sys/types.h> #include <bits/types/sigset_t.h> @@ -87,6 +88,19 @@ struct epoll_event epoll_data_t data; /* User data variable */ } __EPOLL_PACKED; +struct epoll_params +{ + uint32_t busy_poll_usecs; + uint16_t busy_poll_budget; + uint8_t prefer_busy_poll; + + /* pad the struct to a multiple of 64bits */ + uint8_t __pad; +}; + +#define EPOLL_IOC_TYPE 0x8A +#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params) +#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params) __BEGIN_DECLS diff --git a/sysdeps/unix/sysv/linux/sys/pidfd.h b/sysdeps/unix/sysv/linux/sys/pidfd.h index 1078322062..9f88d297e8 100644 --- a/sysdeps/unix/sysv/linux/sys/pidfd.h +++ b/sysdeps/unix/sysv/linux/sys/pidfd.h @@ -22,12 +22,14 @@ #include <bits/types/siginfo_t.h> #define PIDFD_NONBLOCK O_NONBLOCK +#define PIDFD_THREAD O_EXCL -/* Returns a file descriptor that refers to the process PID. The - close-on-exec is set on the file descriptor. +#define PIDFD_SIGNAL_THREAD (1UL << 0) +#define PIDFD_SIGNAL_THREAD_GROUP (1UL << 1) +#define PIDFD_SIGNAL_PROCESS_GROUP (1UL << 2) - The FLAGS argument is reserved for future use, it must be specified - as 0. */ +/* Returns a file descriptor that refers to the process PID. The + close-on-exec is set on the file descriptor. */ extern int pidfd_open (__pid_t __pid, unsigned int __flags) __THROW; /* Duplicates an existing file descriptor TARGETFD in the process referred @@ -39,10 +41,7 @@ extern int pidfd_getfd (int __pidfd, int __targetfd, unsigned int __flags) __THROW; /* Sends the signal SIG to the target process referred by the PIDFD. If - INFO points to a siginfo_t buffer, it will be populated. 
- - The FLAGS argument is reserved for future use, it must be specified - as 0. */ + INFO points to a siginfo_t buffer, it will be populated. */ extern int pidfd_send_signal (int __pidfd, int __sig, siginfo_t *__info, unsigned int __flags) __THROW; diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list index 6557bcfde4..672d39eaad 100644 --- a/sysdeps/unix/sysv/linux/syscall-names.list +++ b/sysdeps/unix/sysv/linux/syscall-names.list @@ -21,8 +21,8 @@ # This file can list all potential system calls. The names are only # used if the installed kernel headers also provide them. -# The list of system calls is current as of Linux 6.8. -kernel 6.8 +# The list of system calls is current as of Linux 6.9. +kernel 6.9 FAST_atomic_update FAST_cmpxchg diff --git a/sysdeps/unix/sysv/linux/timespec_get.c b/sysdeps/unix/sysv/linux/timespec_get.c index c6e5e66289..778d1e3354 100644 --- a/sysdeps/unix/sysv/linux/timespec_get.c +++ b/sysdeps/unix/sysv/linux/timespec_get.c @@ -5,7 +5,7 @@ The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either - version 2.1 of the License. + version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/sysdeps/unix/sysv/linux/timespec_getres.c b/sysdeps/unix/sysv/linux/timespec_getres.c index 5acebe2a2c..2eef9e512c 100644 --- a/sysdeps/unix/sysv/linux/timespec_getres.c +++ b/sysdeps/unix/sysv/linux/timespec_getres.c @@ -5,7 +5,7 @@ The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either - version 2.1 of the License. + version 2.1 of the License, or (at your option) any later version. 
The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/sysdeps/unix/sysv/linux/tst-epoll-ioctls.c b/sysdeps/unix/sysv/linux/tst-epoll-ioctls.c new file mode 100644 index 0000000000..618ecc4e86 --- /dev/null +++ b/sysdeps/unix/sysv/linux/tst-epoll-ioctls.c @@ -0,0 +1,92 @@ +/* Basic tests for Linux epoll ioctls. + Copyright (C) 2022-2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <intprops.h> +#include <stdlib.h> +#include <string.h> +#include <support/check.h> +#include <support/process_state.h> +#include <support/support.h> +#include <support/test-driver.h> +#include <support/xsignal.h> +#include <support/xunistd.h> +#include <sys/ioctl.h> +#include <sys/epoll.h> + +static void +test_epoll_ioctl (void) +{ + int efd = epoll_create1 (0); + TEST_VERIFY_EXIT (efd != -1); + + struct epoll_params params; + + TEST_COMPARE (ioctl (efd, EPIOCGPARAMS, ¶ms), 0); + + /* parameters are all 0 by default */ + TEST_COMPARE (params.busy_poll_usecs, 0); + TEST_COMPARE (params.busy_poll_budget, 0); + TEST_COMPARE (params.prefer_busy_poll, 0); + TEST_COMPARE (params.__pad, 0); + + /* set custom parameters */ + params.busy_poll_usecs = 40; + params.busy_poll_budget = 8; + params.prefer_busy_poll = 1; + params.__pad = 0; + + TEST_COMPARE (ioctl (efd, EPIOCSPARAMS, ¶ms), 0); + + memset (¶ms, 0, sizeof (params)); + + TEST_COMPARE (ioctl (efd, EPIOCGPARAMS, ¶ms), 0); + + /* check custom values were retrieved after being set */ + TEST_COMPARE (params.busy_poll_usecs, 40); + TEST_COMPARE (params.busy_poll_budget, 8); + TEST_COMPARE (params.prefer_busy_poll, 1); + TEST_COMPARE (params.__pad, 0); + + xclose (efd); +} + +static bool +ioctl_supported (void) +{ + int efd = epoll_create1 (0); + TEST_VERIFY_EXIT (efd != -1); + + struct epoll_params params; + int r = ioctl (efd, EPIOCGPARAMS, ¶ms); + xclose (efd); + + return (r == 0); +} + +static int +do_test (void) +{ + if (ioctl_supported ()) + test_epoll_ioctl (); + else + return EXIT_UNSUPPORTED; + + return 0; +} + +#include <support/test-driver.c> diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py index 56c0cf3e22..4a8f4e8919 100644 --- a/sysdeps/unix/sysv/linux/tst-mman-consts.py +++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py @@ -33,7 +33,7 @@ def main(): help='C compiler (including options) to use') args = parser.parse_args() linux_version_headers = 
glibcsyscalls.linux_kernel_version(args.cc) - linux_version_glibc = (6, 8) + linux_version_glibc = (6, 9) sys.exit(glibcextract.compare_macro_consts( '#define _GNU_SOURCE 1\n' '#include <sys/mman.h>\n', diff --git a/sysdeps/unix/sysv/linux/tst-mount-consts.py b/sysdeps/unix/sysv/linux/tst-mount-consts.py index 8613db96d7..c4a67221c1 100755 --- a/sysdeps/unix/sysv/linux/tst-mount-consts.py +++ b/sysdeps/unix/sysv/linux/tst-mount-consts.py @@ -39,10 +39,10 @@ def main(): sys.exit (77) linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc) - # Constants in glibc were updated to match Linux v6.8. When glibc + # Constants in glibc were updated to match Linux v6.9. When glibc # constants are updated this value should be updated to match the # released kernel version from which the constants were taken. - linux_version_glibc = (6, 8) + linux_version_glibc = (6, 9) def check(cte, exclude=None): return glibcextract.compare_macro_consts( '#include <sys/mount.h>\n', diff --git a/sysdeps/unix/sysv/linux/tst-pidfd-consts.py b/sysdeps/unix/sysv/linux/tst-pidfd-consts.py index 96875ac266..6f05291949 100644 --- a/sysdeps/unix/sysv/linux/tst-pidfd-consts.py +++ b/sysdeps/unix/sysv/linux/tst-pidfd-consts.py @@ -39,7 +39,7 @@ def main(): sys.exit (77) linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc) - linux_version_glibc = (6, 8) + linux_version_glibc = (6, 9) sys.exit(glibcextract.compare_macro_consts( '#include <sys/pidfd.h>\n', '#include <asm/fcntl.h>\n' diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist index c1c5c76e26..bbf646fe7f 100644 --- a/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist +++ b/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist @@ -1182,3 +1182,11 @@ GLIBC_2.35 hypotf F GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 
log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/unix/sysv/linux/x86_64/clone3.S b/sysdeps/unix/sysv/linux/x86_64/clone3.S index 5a2d42234e..4cc19e066c 100644 --- a/sysdeps/unix/sysv/linux/x86_64/clone3.S +++ b/sysdeps/unix/sysv/linux/x86_64/clone3.S @@ -86,4 +86,3 @@ L(thread_start): PSEUDO_END (__clone3) libc_hidden_def (__clone3) -weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/libm.abilist b/sysdeps/unix/sysv/linux/x86_64/x32/libm.abilist index fac219d45a..7c7b884dde 100644 --- a/sysdeps/unix/sysv/linux/x86_64/x32/libm.abilist +++ b/sysdeps/unix/sysv/linux/x86_64/x32/libm.abilist @@ -1182,3 +1182,11 @@ GLIBC_2.35 hypotf F GLIBC_2.38 fmod F GLIBC_2.38 fmodf F GLIBC_2.39 exp10 F +GLIBC_2.40 log2p1 F +GLIBC_2.40 log2p1f F +GLIBC_2.40 log2p1f128 F +GLIBC_2.40 log2p1f32 F +GLIBC_2.40 log2p1f32x F +GLIBC_2.40 log2p1f64 F +GLIBC_2.40 log2p1f64x F +GLIBC_2.40 log2p1l F diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h index ab73556772..83491607c7 100644 --- a/sysdeps/x86/cacheinfo.h +++ b/sysdeps/x86/cacheinfo.h @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024; long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; -/* Threshold to use non temporal store. */ +/* Threshold to use non temporal store in memmove. */ long int __x86_shared_non_temporal_threshold attribute_hidden; +/* Threshold to use non temporal store in memset. */ +long int __x86_memset_non_temporal_threshold attribute_hidden; + /* Threshold to use Enhanced REP MOVSB. 
*/ long int __x86_rep_movsb_threshold attribute_hidden = 2048; @@ -77,6 +80,9 @@ init_cacheinfo (void) __x86_shared_non_temporal_threshold = cpu_features->non_temporal_threshold; + __x86_memset_non_temporal_threshold + = cpu_features->memset_non_temporal_threshold; + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold; diff --git a/sysdeps/x86/configure b/sysdeps/x86/configure index d28d9bcb29..04c6ba3e6c 100644 --- a/sysdeps/x86/configure +++ b/sysdeps/x86/configure @@ -139,8 +139,12 @@ libc_cv_have_x86_isa_level=4 libc_cv_have_x86_isa_level=3 #elif MINIMUM_X86_ISA_LEVEL == 2 libc_cv_have_x86_isa_level=2 -#else +#elif defined __x86_64__ libc_cv_have_x86_isa_level=baseline +#elif MINIMUM_X86_ISA_LEVEL == 1 +libc_cv_have_x86_isa_level=1 +#else +libc_cv_have_x86_isa_level=0 #endif EOF eval `${CC-cc} $CFLAGS $CPPFLAGS $ISAFLAG -I$srcdir -E conftest.c | grep libc_cv_have_x86_isa_level` @@ -148,8 +152,10 @@ EOF fi { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_have_x86_isa_level" >&5 printf "%s\n" "$libc_cv_have_x86_isa_level" >&6; } -else +elif test $base_machine = x86_64; then libc_cv_have_x86_isa_level=baseline +else + libc_cv_have_x86_isa_level=0 fi if test $libc_cv_have_x86_isa_level = baseline; then printf "%s\n" "#define MINIMUM_X86_ISA_LEVEL 1" >>confdefs.h diff --git a/sysdeps/x86/configure.ac b/sysdeps/x86/configure.ac index 5b0acd03d2..8a259d3971 100644 --- a/sysdeps/x86/configure.ac +++ b/sysdeps/x86/configure.ac @@ -96,14 +96,20 @@ libc_cv_have_x86_isa_level=4 libc_cv_have_x86_isa_level=3 #elif MINIMUM_X86_ISA_LEVEL == 2 libc_cv_have_x86_isa_level=2 -#else +#elif defined __x86_64__ libc_cv_have_x86_isa_level=baseline +#elif MINIMUM_X86_ISA_LEVEL == 1 +libc_cv_have_x86_isa_level=1 +#else +libc_cv_have_x86_isa_level=0 #endif EOF eval `${CC-cc} $CFLAGS $CPPFLAGS $ISAFLAG -I$srcdir -E conftest.c | grep 
libc_cv_have_x86_isa_level` rm -rf conftest*]) -else +elif test $base_machine = x86_64; then libc_cv_have_x86_isa_level=baseline +else + libc_cv_have_x86_isa_level=0 fi if test $libc_cv_have_x86_isa_level = baseline; then AC_DEFINE_UNQUOTED(MINIMUM_X86_ISA_LEVEL, 1) diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index 5a98f70364..d2fe61b997 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) rep_movsb_threshold = 2112; + /* Non-temporal stores are more performant on Intel and AMD hardware above + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ + unsigned long int memset_non_temporal_threshold = SIZE_MAX; + if (cpu_features->basic.kind == arch_kind_intel + || cpu_features->basic.kind == arch_kind_amd) + memset_non_temporal_threshold = non_temporal_threshold; + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of cases slower than the vectorized path (and for some alignments, it is really slow, check BZ #30994). 
*/ @@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) && tunable_size <= maximum_non_temporal_threshold) non_temporal_threshold = tunable_size; + tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); + if (tunable_size > minimum_non_temporal_threshold + && tunable_size <= maximum_non_temporal_threshold) + memset_non_temporal_threshold = tunable_size; + tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); if (tunable_size > minimum_rep_movsb_threshold) rep_movsb_threshold = tunable_size; @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, minimum_non_temporal_threshold, maximum_non_temporal_threshold); + TUNABLE_SET_WITH_BOUNDS ( + x86_memset_non_temporal_threshold, memset_non_temporal_threshold, + minimum_non_temporal_threshold, maximum_non_temporal_threshold); TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, minimum_rep_movsb_threshold, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) cpu_features->data_cache_size = data; cpu_features->shared_cache_size = shared; cpu_features->non_temporal_threshold = non_temporal_threshold; + cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold; cpu_features->rep_movsb_threshold = rep_movsb_threshold; cpu_features->rep_stosb_threshold = rep_stosb_threshold; cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold; diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c index ceafde9481..49eeb5f70a 100644 --- a/sysdeps/x86/dl-diagnostics-cpu.c +++ b/sysdeps/x86/dl-diagnostics-cpu.c @@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void) cpu_features->shared_cache_size); print_cpu_features_value ("non_temporal_threshold", cpu_features->non_temporal_threshold); + print_cpu_features_value 
("memset_non_temporal_threshold", + cpu_features->memset_non_temporal_threshold); print_cpu_features_value ("rep_movsb_threshold", cpu_features->rep_movsb_threshold); print_cpu_features_value ("rep_movsb_stop_threshold", diff --git a/sysdeps/x86/dl-get-cpu-features.c b/sysdeps/x86/dl-get-cpu-features.c index 4d6c5c59a6..579d02d638 100644 --- a/sysdeps/x86/dl-get-cpu-features.c +++ b/sysdeps/x86/dl-get-cpu-features.c @@ -64,6 +64,11 @@ Fatal glibc error: CPU does not support x86-64-v%d\n", 4); # endif /* ISA level 4 */ # endif /* ISA level 3 */ # endif /* ISA level 2 */ +# ifdef GCCMACRO__APX_F__ + if (!CPU_FEATURE_USABLE_P (cpu_features, APX_F)) + _dl_fatal_printf ("\ +Fatal glibc error: CPU does not support APX\n"); +# endif # endif /* IS_IN (rtld) */ } } diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list index 7d82da0dec..a0a1299592 100644 --- a/sysdeps/x86/dl-tunables.list +++ b/sysdeps/x86/dl-tunables.list @@ -30,6 +30,9 @@ glibc { x86_non_temporal_threshold { type: SIZE_T } + x86_memset_non_temporal_threshold { + type: SIZE_T + } x86_rep_movsb_threshold { type: SIZE_T # Since there is overhead to set up REP MOVSB operation, REP diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h index cd7bd27cf3..aaae44f0e1 100644 --- a/sysdeps/x86/include/cpu-features.h +++ b/sysdeps/x86/include/cpu-features.h @@ -944,8 +944,10 @@ struct cpu_features /* Shared cache size for use in memory and string routines, typically L2 or L3 size. */ unsigned long int shared_cache_size; - /* Threshold to use non temporal store. */ + /* Threshold to use non temporal store in memmove. */ unsigned long int non_temporal_threshold; + /* Threshold to use non temporal store in memset. */ + unsigned long int memset_non_temporal_threshold; /* Threshold to use "rep movsb". */ unsigned long int rep_movsb_threshold; /* Threshold to stop using "rep movsb". 
*/ diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h index 2c7f74212b..03c1fe2bf5 100644 --- a/sysdeps/x86/isa-level.h +++ b/sysdeps/x86/isa-level.h @@ -35,7 +35,17 @@ # define __X86_ISA_V1 0 #endif -#if __X86_ISA_V1 && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 \ +#ifdef __x86_64__ +# ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 +# define __GCC_HAVE_SYNC_COMPARE_AND_SWAP +# endif +#else +# ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 +# define __GCC_HAVE_SYNC_COMPARE_AND_SWAP +# endif +#endif + +#if __X86_ISA_V1 && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP \ && defined HAVE_X86_LAHF_SAHF && defined __POPCNT__ && defined __SSE3__ \ && defined __SSSE3__ && defined __SSE4_1__ && defined __SSE4_2__ /* NB: ISAs in x86-64 ISA level v2 are used. */ diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c index e270c29db7..a50afea2f9 100644 --- a/sysdeps/x86/tst-cpu-features-supports.c +++ b/sysdeps/x86/tst-cpu-features-supports.c @@ -65,7 +65,7 @@ do_test (int argc, char **argv) #endif fails += CHECK_FEATURE_ACTIVE (avx, AVX); fails += CHECK_FEATURE_ACTIVE (avx2, AVX2); -#if __GNUC_PREREQ (7, 0) +#if __GNUC_PREREQ (7, 0) && !__GNUC_PREREQ (15, 0) fails += CHECK_FEATURE_ACTIVE (avx5124fmaps, AVX512_4FMAPS); fails += CHECK_FEATURE_ACTIVE (avx5124vnniw, AVX512_4VNNIW); #endif @@ -92,14 +92,18 @@ do_test (int argc, char **argv) #if __GNUC_PREREQ (6, 0) fails += CHECK_FEATURE_ACTIVE (avx512bw, AVX512BW); fails += CHECK_FEATURE_ACTIVE (avx512cd, AVX512CD); +# if !__GNUC_PREREQ (15, 0) fails += CHECK_FEATURE_ACTIVE (avx512er, AVX512ER); +# endif fails += CHECK_FEATURE_ACTIVE (avx512dq, AVX512DQ); #endif #if __GNUC_PREREQ (5, 0) fails += CHECK_FEATURE_ACTIVE (avx512f, AVX512F); #endif #if __GNUC_PREREQ (6, 0) +# if !__GNUC_PREREQ (15, 0) fails += CHECK_FEATURE_ACTIVE (avx512pf, AVX512PF); +# endif fails += CHECK_FEATURE_ACTIVE (avx512vl, AVX512VL); #endif #if __GNUC_PREREQ (5, 0) @@ -148,7 +152,9 @@ do_test (int argc, char **argv) 
#endif fails += CHECK_FEATURE_ACTIVE (popcnt, POPCNT); #if __GNUC_PREREQ (11, 0) +# if !__GNUC_PREREQ (15, 0) fails += CHECK_FEATURE_ACTIVE (prefetchwt1, PREFETCHWT1); +# endif fails += CHECK_FEATURE_ACTIVE (ptwrite, PTWRITE); fails += CHECK_FEATURE_ACTIVE (rdpid, RDPID); fails += CHECK_FEATURE_ACTIVE (rdrnd, RDRAND); diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c index f6a65b88de..8589a9fd66 100644 --- a/sysdeps/x86/tst-hwcap-tunables.c +++ b/sysdeps/x86/tst-hwcap-tunables.c @@ -133,7 +133,7 @@ do_test (int argc, char *argv[]) setenv ("GLIBC_TUNABLES", tunable, 1); struct support_capture_subprocess result - = support_capture_subprogram (spargv[0], spargv); + = support_capture_subprogram (spargv[0], spargv, NULL); support_capture_subprocess_check (&result, "tst-tunables", 0, sc_allow_stderr); support_capture_subprocess_free (&result); diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index ff5d45f7cb..a6de3793e4 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -245,10 +245,11 @@ elf_machine_plt_value (struct link_map *map, const ElfW(Rela) *reloc, MAP is the object containing the reloc. 
*/ static inline void __attribute__((always_inline)) -elf_machine_rela(struct link_map *map, struct r_scope_elem *scope[], - const ElfW(Rela) *reloc, const ElfW(Sym) *sym, - const struct r_found_version *version, - void *const reloc_addr_arg, int skip_ifunc) { +elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[], + const ElfW(Rela) *reloc, const ElfW(Sym) *sym, + const struct r_found_version *version, + void *const reloc_addr_arg, int skip_ifunc) +{ ElfW(Addr) *const reloc_addr = reloc_addr_arg; const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info); diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps index e0015347d0..893c22b19b 100644 --- a/sysdeps/x86_64/fpu/libm-test-ulps +++ b/sysdeps/x86_64/fpu/libm-test-ulps @@ -1847,6 +1847,30 @@ float: 1 Function: "log2_vlen8_avx2": float: 1 +Function: "log2p1": +double: 1 +float: 1 +float128: 3 +ldouble: 2 + +Function: "log2p1_downward": +double: 2 +float: 2 +float128: 3 +ldouble: 4 + +Function: "log2p1_towardzero": +double: 2 +float: 2 +float128: 2 +ldouble: 4 + +Function: "log2p1_upward": +double: 1 +float: 2 +float128: 2 +ldouble: 5 + Function: "log_downward": float: 2 float128: 1 diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 97839a2248..88bf08e4f4 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -21,10 +21,13 @@ 2. If size is less than VEC, use integer register stores. 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. - 5. On machines ERMS feature, if size is greater or equal than - __x86_rep_stosb_threshold then REP STOSB will be used. - 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with - 4 VEC stores and store 4 * VEC at a time until done. */ + 5. 
If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with + 4 VEC stores and store 4 * VEC at a time until done. + 6. On machines ERMS feature, if size is range + [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold) + then REP STOSB will be used. + 7. If size >= __x86_memset_non_temporal_threshold, use a + non-temporal stores. */ #include <sysdep.h> @@ -147,6 +150,41 @@ L(entry_from_wmemset): VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VMM(0), (%rdi) VZEROUPPER_RETURN + + /* If have AVX512 mask instructions put L(less_vec) close to + entry as it doesn't take much space and is likely a hot target. */ +#ifdef USE_LESS_VEC_MASK_STORE + /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */ + .p2align 6,, 47 + .p2align 4 +L(less_vec): +L(less_vec_from_wmemset): + /* Less than 1 VEC. */ +# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +# endif + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. Note that we are using rax which is set in + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ + andl $(PAGE_SIZE - 1), %edi + /* Check if VEC_SIZE store cross page. Mask stores suffer + serious performance degradation when it has to fault suppress. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %edi + /* This is generally considered a cold target. */ + ja L(cross_page) +# if VEC_SIZE > 32 + movq $-1, %rcx + bzhiq %rdx, %rcx, %rcx + kmovq %rcx, %k1 +# else + movl $-1, %ecx + bzhil %edx, %ecx, %ecx + kmovd %ecx, %k1 +# endif + vmovdqu8 %VMM(0), (%rax){%k1} + VZEROUPPER_RETURN +#endif + #if defined USE_MULTIARCH && IS_IN (libc) END (MEMSET_SYMBOL (__memset, unaligned)) @@ -185,54 +223,6 @@ L(last_2x_vec): #endif VZEROUPPER_RETURN - /* If have AVX512 mask instructions put L(less_vec) close to - entry as it doesn't take much space and is likely a hot target. - */ -#ifdef USE_LESS_VEC_MASK_STORE - .p2align 4,, 10 -L(less_vec): -L(less_vec_from_wmemset): - /* Less than 1 VEC. 
*/ -# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 -# error Unsupported VEC_SIZE! -# endif - /* Clear high bits from edi. Only keeping bits relevant to page - cross check. Note that we are using rax which is set in - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ - andl $(PAGE_SIZE - 1), %edi - /* Check if VEC_SIZE store cross page. Mask stores suffer - serious performance degradation when it has to fault suppress. - */ - cmpl $(PAGE_SIZE - VEC_SIZE), %edi - /* This is generally considered a cold target. */ - ja L(cross_page) -# if VEC_SIZE > 32 - movq $-1, %rcx - bzhiq %rdx, %rcx, %rcx - kmovq %rcx, %k1 -# else - movl $-1, %ecx - bzhil %edx, %ecx, %ecx - kmovd %ecx, %k1 -# endif - vmovdqu8 %VMM(0), (%rax){%k1} - VZEROUPPER_RETURN - -# if defined USE_MULTIARCH && IS_IN (libc) - /* Include L(stosb_local) here if including L(less_vec) between - L(stosb_more_2x_vec) and ENTRY. This is to cache align the - L(stosb_more_2x_vec) target. */ - .p2align 4,, 10 -L(stosb_local): - movzbl %sil, %eax - mov %RDX_LP, %RCX_LP - mov %RDI_LP, %RDX_LP - rep stosb - mov %RDX_LP, %RAX_LP - VZEROUPPER_RETURN -# endif -#endif - #if defined USE_MULTIARCH && IS_IN (libc) .p2align 4 L(stosb_more_2x_vec): @@ -318,21 +308,33 @@ L(return_vzeroupper): ret #endif - .p2align 4,, 10 -#ifndef USE_LESS_VEC_MASK_STORE -# if defined USE_MULTIARCH && IS_IN (libc) +#ifdef USE_WITH_AVX2 + .p2align 4 +#else + .p2align 4,, 4 +#endif + +#if defined USE_MULTIARCH && IS_IN (libc) /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in range for 2-byte jump encoding. */ L(stosb_local): + cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP + jae L(nt_memset) movzbl %sil, %eax mov %RDX_LP, %RCX_LP mov %RDI_LP, %RDX_LP rep stosb +# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512) + /* Use xchg to save 1-byte (this helps align targets below). 
*/ + xchg %RDX_LP, %RAX_LP +# else mov %RDX_LP, %RAX_LP - VZEROUPPER_RETURN # endif + VZEROUPPER_RETURN +#endif +#ifndef USE_LESS_VEC_MASK_STORE /* Define L(less_vec) only if not otherwise defined. */ - .p2align 4 + .p2align 4,, 12 L(less_vec): /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to xmm). This is only does anything for AVX2. */ @@ -423,4 +425,35 @@ L(between_2_3): movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) #endif ret -END (MEMSET_SYMBOL (__memset, unaligned_erms)) + +#if defined USE_MULTIARCH && IS_IN (libc) +# ifdef USE_WITH_AVX512 + /* Force align so the loop doesn't cross a cache-line. */ + .p2align 4 +# endif + .p2align 4,, 7 + /* Memset using non-temporal stores. */ +L(nt_memset): + VMOVU %VMM(0), (VEC_SIZE * 0)(%rdi) + leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx + /* Align DST. */ + orq $(VEC_SIZE * 1 - 1), %rdi + incq %rdi + .p2align 4,, 7 +L(nt_loop): + VMOVNT %VMM(0), (VEC_SIZE * 0)(%rdi) + VMOVNT %VMM(0), (VEC_SIZE * 1)(%rdi) + VMOVNT %VMM(0), (VEC_SIZE * 2)(%rdi) + VMOVNT %VMM(0), (VEC_SIZE * 3)(%rdi) + subq $(VEC_SIZE * -4), %rdi + cmpq %rdx, %rdi + jb L(nt_loop) + sfence + VMOVU %VMM(0), (VEC_SIZE * 0)(%rdx) + VMOVU %VMM(0), (VEC_SIZE * 1)(%rdx) + VMOVU %VMM(0), (VEC_SIZE * 2)(%rdx) + VMOVU %VMM(0), (VEC_SIZE * 3)(%rdx) + VZEROUPPER_RETURN +#endif + +END(MEMSET_SYMBOL(__memset, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S index 392215950a..10bfb0a531 100644 --- a/sysdeps/x86_64/multiarch/wcsncat-evex.S +++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S @@ -1,9 +1,9 @@ -#ifndef WCSCAT -# define WCSCAT __wcsncat_evex +#ifndef WCSNCAT +# define WCSNCAT __wcsncat_evex #endif #define USE_AS_WCSCPY #define USE_AS_STRCAT -#define STRNCAT WCSCAT +#define STRNCAT WCSNCAT #include "strncat-evex.S" |