diff options
Diffstat (limited to 'sysdeps/aarch64/fpu/log2f_advsimd.c')
-rw-r--r-- | sysdeps/aarch64/fpu/log2f_advsimd.c | 38 |
1 files changed, 24 insertions, 14 deletions
diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c index db21836749..84effe4fe9 100644 --- a/sysdeps/aarch64/fpu/log2f_advsimd.c +++ b/sysdeps/aarch64/fpu/log2f_advsimd.c @@ -22,9 +22,9 @@ static const struct data { - uint32x4_t min_norm; + uint32x4_t off, offset_lower_bound; uint16x8_t special_bound; - uint32x4_t off, mantissa_mask; + uint32x4_t mantissa_mask; float32x4_t poly[9]; } data = { /* Coefficients generated using Remez algorithm approximate @@ -34,18 +34,22 @@ static const struct data V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, - .min_norm = V4 (0x00800000), - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ .off = V4 (0x3f2aaaab), /* 0.666667. */ .mantissa_mask = V4 (0x007fffff), }; static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, - uint16x4_t cmp) +special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r, + uint16x4_t cmp, const struct data *d) { /* Fall back to scalar code. */ - return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); + return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); } /* Fast implementation for single precision AdvSIMD log2, @@ -56,15 +60,21 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - uint32x4_t u = vreinterpretq_u32_f32 (x); - uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), - vget_low_u16 (d->special_bound)); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vreinterpretq_u32_f32 (x); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = vsubq_u32 (u, d->off); + u_off = vsubq_u32 (u_off, d->off); float32x4_t n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ - u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); /* y = log2(1+r) + n. */ @@ -72,7 +82,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); if (__glibc_unlikely (v_any_u16h (special))) - return special_case (x, n, p, r, special); + return special_case (n, u_off, p, r, special, d); return vfmaq_f32 (n, p, r); } libmvec_hidden_def (V_NAME_F1 (log2)) |