39 files changed, 710 insertions, 701 deletions
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index cc15ce2d1e..015211f5f4 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -135,4 +135,11 @@ libmvec {
     _ZGVsMxv_tanh;
     _ZGVsMxv_tanhf;
   }
+  GLIBC_2.41 {
+    _ZGVnN2v_logp1;
+    _ZGVnN2v_logp1f;
+    _ZGVnN4v_logp1f;
+    _ZGVsMxv_logp1;
+    _ZGVsMxv_logp1f;
+  }
 }
diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
index 8916dcbf40..004474acf9 100644
--- a/sysdeps/aarch64/fpu/acoshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -25,35 +25,32 @@ const static struct data
 {
   struct v_log1pf_data log1pf_consts;
   uint32x4_t one;
-  uint16x4_t thresh;
-} data = {
-  .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
-  .one = V4 (0x3f800000),
-  .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)).  */
-};
+} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
+
+#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)).  */
 
 static float32x4_t NOINLINE VPCS_ATTR
 special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
-	      const struct v_log1pf_data d)
+	      const struct v_log1pf_data *d)
 {
   return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
 }
 
 /* Vector approximation for single-precision acosh, based on log1p. Maximum
    error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
-   is 2.78 ULP:
-   __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
-			   want 0x1.ef9ea2p-3.
+   is 3.00 ULP:
+   _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
+				 want 0x1.ef0a7cp-4.
    With exceptions disabled, we can compute u with a shorter dependency chain,
-   which gives maximum error of 3.07 ULP:
-  __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
-			   want 0x1.fbc7f4p-4.  */
+   which gives maximum error of 3.22 ULP:
+   _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
+				 want 0x1.fdcdd2p-5.  */
 
 VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   uint32x4_t ix = vreinterpretq_u32_f32 (x);
-  uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+  uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
 
 #if WANT_SIMD_EXCEPT
   /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
@@ -64,15 +61,16 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
   float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
   float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
 #else
-  float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
-  float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+  float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+  float32x4_t u
+      = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
 #endif
 
   float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
 
   if (__glibc_unlikely (v_any_u16h (special)))
-    return special_case (x, y, special, d->log1pf_consts);
-  return log1pf_inline (y, d->log1pf_consts);
+    return special_case (x, y, special, &d->log1pf_consts);
+  return log1pf_inline (y, &d->log1pf_consts);
 }
 libmvec_hidden_def (V_NAME_F1 (acosh))
 HALF_WIDTH_ALIAS_F1 (acosh)
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 097d403ffe..5909bb4ce9 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -36,6 +36,7 @@ libmvec_hidden_proto (V_NAME_F2(hypot));
 libmvec_hidden_proto (V_NAME_F1(log10));
 libmvec_hidden_proto (V_NAME_F1(log1p));
 libmvec_hidden_proto (V_NAME_F1(log2));
+libmvec_hidden_proto (V_NAME_F1(logp1));
 libmvec_hidden_proto (V_NAME_F1(log));
 libmvec_hidden_proto (V_NAME_F2(pow));
 libmvec_hidden_proto (V_NAME_F1(sin));
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
index 09fd8a6143..eb789b91b6 100644
--- a/sysdeps/aarch64/fpu/asinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
@@ -20,16 +20,16 @@
 #include "v_math.h"
 #include "v_log1pf_inline.h"
 
-#define SignMask v_u32 (0x80000000)
-
 const static struct data
 {
   struct v_log1pf_data log1pf_consts;
+  float32x4_t one;
   uint32x4_t big_bound;
 #if WANT_SIMD_EXCEPT
   uint32x4_t tiny_bound;
 #endif
 } data = {
+  .one = V4 (1),
   .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
   .big_bound = V4 (0x5f800000), /* asuint(0x1p64).  */
 #if WANT_SIMD_EXCEPT
@@ -38,20 +38,27 @@ const static struct data
 };
 
 static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
+	      uint32x4_t special, const struct data *d)
 {
-  return v_call_f32 (asinhf, x, y, special);
+  return v_call_f32 (
+      asinhf, x,
+      vreinterpretq_f32_u32 (veorq_u32 (
+	  sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
+      special);
 }
 
 /* Single-precision implementation of vector asinh(x), using vector log1p.
-   Worst-case error is 2.66 ULP, at roughly +/-0.25:
-   __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3.  */
+   Worst-case error is 2.59 ULP:
+   _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
+				 want 0x1.d449c4p-3.  */
 VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
 {
   const struct data *dat = ptr_barrier (&data);
-  uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
-  float32x4_t ax = vreinterpretq_f32_u32 (iax);
+  float32x4_t ax = vabsq_f32 (x);
+  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
   uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+  uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
   float32x4_t special_arg = x;
 
 #if WANT_SIMD_EXCEPT
@@ -68,13 +75,13 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
   /* asinh(x) = log(x + sqrt(x * x + 1)).
      For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))).  */
   float32x4_t d
-      = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
-  float32x4_t y = log1pf_inline (
-      vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
+      = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
+  float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
 
   if (__glibc_unlikely (v_any_u32 (special)))
-    return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
-  return vbslq_f32 (SignMask, x, y);
+    return special_case (special_arg, sign, y, special, dat);
+  return vreinterpretq_f32_u32 (veorq_u32 (
+      sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
 }
 libmvec_hidden_def (V_NAME_F1 (asinh))
 HALF_WIDTH_ALIAS_F1 (asinh)
diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c
index ae488f7b54..818b6c92ad 100644
--- a/sysdeps/aarch64/fpu/atanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c
@@ -40,15 +40,17 @@ const static struct data
 #define Half v_u32 (0x3f000000)
 
 static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
+	      uint32x4_t special)
 {
-  return v_call_f32 (atanhf, x, y, special);
+  return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
+		     vmulq_f32 (halfsign, y), special);
 }
 
 /* Approximation for vector single-precision atanh(x) using modified log1p.
-   The maximum error is 3.08 ULP:
-   __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
-			   want 0x1.ffcb82p-5.  */
+   The maximum error is 2.93 ULP:
+   _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
+				want 0x1.f4dcf8p-5.  */
 VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
@@ -68,11 +70,19 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
   uint32x4_t special = vcgeq_u32 (iax, d->one);
 #endif
 
-  float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
-  y = log1pf_inline (y, d->log1pf_consts);
+  float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
+			     vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
+  y = log1pf_inline (y, &d->log1pf_consts);
 
+  /* If exceptions not required, pass ax to special-case for shorter dependency
+     chain. If exceptions are required ax will have been zerofied, so have to
+     pass x.  */
   if (__glibc_unlikely (v_any_u32 (special)))
-    return special_case (x, vmulq_f32 (halfsign, y), special);
+#if WANT_SIMD_EXCEPT
+    return special_case (x, halfsign, y, special);
+#else
+    return special_case (ax, halfsign, y, special);
+#endif
   return vmulq_f32 (halfsign, y);
 }
 libmvec_hidden_def (V_NAME_F1 (atanh))
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 7484150131..f295fe185d 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -113,6 +113,10 @@
 # define __DECL_SIMD_log2 __DECL_SIMD_aarch64
 # undef __DECL_SIMD_log2f
 # define __DECL_SIMD_log2f __DECL_SIMD_aarch64
+# undef __DECL_SIMD_logp1
+# define __DECL_SIMD_logp1 __DECL_SIMD_aarch64
+# undef __DECL_SIMD_logp1f
+# define __DECL_SIMD_logp1f __DECL_SIMD_aarch64
 # undef __DECL_SIMD_pow
 # define __DECL_SIMD_pow __DECL_SIMD_aarch64
 # undef __DECL_SIMD_powf
@@ -180,6 +184,7 @@ __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
@@ -207,6 +212,7 @@ __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
@@ -239,6 +245,7 @@ __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
@@ -266,6 +273,7 @@ __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
index 3924c9ce44..11a89b1530 100644
--- a/sysdeps/aarch64/fpu/cos_advsimd.c
+++ b/sysdeps/aarch64/fpu/cos_advsimd.c
@@ -22,7 +22,7 @@
 static const struct data
 {
   float64x2_t poly[7];
-  float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+  float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
 } data = {
   /* Worst-case error is 3.3 ulp in [-pi/2, pi/2].  */
   .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
@@ -30,11 +30,9 @@ static const struct data
 	    V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
 	    V2 (-0x1.9e9540300a1p-41) },
   .inv_pi = V2 (0x1.45f306dc9c883p-2),
-  .half_pi = V2 (0x1.921fb54442d18p+0),
   .pi_1 = V2 (0x1.921fb54442d18p+1),
   .pi_2 = V2 (0x1.1a62633145c06p-53),
   .pi_3 = V2 (0x1.c1cd129024e09p-106),
-  .shift = V2 (0x1.8p52),
   .range_val = V2 (0x1p23)
 };
 
@@ -68,10 +66,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
 #endif
 
   /* n = rint((|x|+pi/2)/pi) - 0.5.  */
-  n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
-  odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
-  n = vsubq_f64 (n, d->shift);
-  n = vsubq_f64 (n, v_f64 (0.5));
+  n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
+  odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
+  n = vsubq_f64 (n, v_f64 (0.5f));
 
   /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
   r = vfmsq_f64 (r, d->pi_1, n);
diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
index d0c285b03a..85a1b37373 100644
--- a/sysdeps/aarch64/fpu/cosf_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
@@ -22,7 +22,7 @@
 static const struct data
 {
   float32x4_t poly[4];
-  float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+  float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
 } data = {
   /* 1.886 ulp error.  */
   .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@@ -33,8 +33,6 @@ static const struct data
   .pi_3 = V4 (-0x1.ee59dap-49f),
 
   .inv_pi = V4 (0x1.45f306p-2f),
-  .shift = V4 (0x1.8p+23f),
-  .half_pi = V4 (0x1.921fb6p0f),
   .range_val = V4 (0x1p20f)
 };
 
@@ -69,9 +67,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
 #endif
 
   /* n = rint((|x|+pi/2)/pi) - 0.5.  */
-  n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
-  odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
-  n = vsubq_f32 (n, d->shift);
+  n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
+  odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
   n = vsubq_f32 (n, v_f32 (0.5f));
 
   /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
index e5d8a299c6..7ad6efa0fc 100644
--- a/sysdeps/aarch64/fpu/coshf_sve.c
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -23,37 +23,42 @@
 static const struct data
 {
   struct sv_expf_data expf_consts;
-  uint32_t special_bound;
+  float special_bound;
 } data = {
   .expf_consts = SV_EXPF_DATA,
   /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case.  */
-  .special_bound = 0x42ad496c,
+  .special_bound = 0x1.5a92d8p+6,
 };
 
 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
+	      svbool_t pg)
 {
-  return sv_call_f32 (coshf, x, y, pg);
+  return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e),
+		      pg);
 }
 
 /* Single-precision vector cosh, using vector expf.
-   Maximum error is 1.89 ULP:
-   _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
-				  want 0x1.f00adcp+127.  */
+   Maximum error is 2.77 ULP:
+   _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
+				 want 0x1.e4594cp+2.  */
 svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
 
-  svfloat32_t ax = svabs_x (pg, x);
-  svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
+  svbool_t special = svacge (pg, x, d->special_bound);
 
-  /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.  */
-  svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
-  svfloat32_t half_t = svmul_x (pg, t, 0.5);
-  svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
+  /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.
+     Note that x is passed to exp here, rather than |x|. This is to avoid using
+     destructive unary ABS for better register usage. However it means the
+     routine is not exactly symmetrical, as the exp helper is slightly less
+     accurate in the negative range.  */
+  svfloat32_t e = expf_inline (x, pg, &d->expf_consts);
+  svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5);
+  svfloat32_t half_over_e = svdivr_x (pg, e, 0.5);
 
   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+    return special_case (x, half_e, half_over_e, special);
 
-  return svadd_x (pg, half_t, half_over_t);
+  return svadd_x (svptrue_b32 (), half_e, half_over_e);
 }
diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c
index e09b2f3b27..8aa3fa9c43 100644
--- a/sysdeps/aarch64/fpu/exp10f_sve.c
+++ b/sysdeps/aarch64/fpu/exp10f_sve.c
@@ -18,74 +18,83 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f32.h"
 
-/* For x < -SpecialBound, the result is subnormal and not handled correctly by
+/* For x < -Thres, the result is subnormal and not handled correctly by
    FEXPA.  */
-#define SpecialBound 37.9
+#define Thres 37.9
 
 static const struct data
 {
-  float poly[5];
-  float shift, log10_2, log2_10_hi, log2_10_lo, special_bound;
+  float log2_10_lo, c0, c2, c4;
+  float c1, c3, log10_2;
+  float shift, log2_10_hi, thres;
 } data = {
   /* Coefficients generated using Remez algorithm with minimisation of relative
      error.
      rel error: 0x1.89dafa3p-24
      abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
      maxerr: 0.52 +0.5 ulp.  */
-  .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f,
-	    0x1.12b41ap-1f },
+  .c0 = 0x1.26bb16p+1f,
+  .c1 = 0x1.5350d2p+1f,
+  .c2 = 0x1.04744ap+1f,
+  .c3 = 0x1.2d8176p+0f,
+  .c4 = 0x1.12b41ap-1f,
   /* 1.5*2^17 + 127, a shift value suitable for FEXPA.  */
-  .shift = 0x1.903f8p17f,
+  .shift = 0x1.803f8p17f,
   .log10_2 = 0x1.a934fp+1,
   .log2_10_hi = 0x1.344136p-2,
   .log2_10_lo = -0x1.ec10cp-27,
-  .special_bound = SpecialBound,
+  .thres = Thres,
 };
 
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+static inline svfloat32_t
+sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
 {
-  return sv_call_f32 (exp10f, x, y, special);
-}
-
-/* Single-precision SVE exp10f routine. Implements the same algorithm
-   as AdvSIMD exp10f.
-   Worst case error is 1.02 ULPs.
-   _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
-				  want 0x1.ba5f9cp-1.  */
-svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
-{
-  const struct data *d = ptr_barrier (&data);
   /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
      with poly(r) in [1/sqrt(2), sqrt(2)] and
      x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N].  */
 
-  /* Load some constants in quad-word chunks to minimise memory access (last
-     lane is wasted).  */
-  svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2);
+  svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);
 
   /* n = round(x/(log10(2)/N)).  */
   svfloat32_t shift = sv_f32 (d->shift);
-  svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0);
-  svfloat32_t n = svsub_x (pg, z, shift);
+  svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
+  svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);
 
   /* r = x - n*log10(2)/N.  */
-  svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1);
-  r = svmls_lane (r, n, log10_2_and_inv, 2);
+  svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
+  r = svmls_lane (r, n, lane_consts, 0);
 
-  svbool_t special = svacgt (pg, x, d->special_bound);
   svfloat32_t scale = svexpa (svreinterpret_u32 (z));
 
   /* Polynomial evaluation: poly(r) ~ exp10(r)-1.  */
-  svfloat32_t r2 = svmul_x (pg, r, r);
-  svfloat32_t poly
-      = svmla_x (pg, svmul_x (pg, r, d->poly[0]),
-		 sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2);
-
-  if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svmla_x (pg, scale, scale, poly), special);
+  svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+  svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+  svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+  svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
+  svfloat32_t poly = svmla_x (pg, p0, r2, p14);
 
   return svmla_x (pg, scale, scale, poly);
 }
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+  return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d),
+		      special);
+}
+
+/* Single-precision SVE exp10f routine. Implements the same algorithm
+   as AdvSIMD exp10f.
+   Worst case error is 1.02 ULPs.
+   _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
+				  want 0x1.ba5f9cp-1.  */
+svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+  svbool_t special = svacgt (pg, x, d->thres);
+  if (__glibc_unlikely (svptest_any (special, special)))
+    return special_case (x, special, d);
+  return sv_exp10f_inline (x, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
index 8a686e3e05..c6216bed9e 100644
--- a/sysdeps/aarch64/fpu/exp2f_sve.c
+++ b/sysdeps/aarch64/fpu/exp2f_sve.c
@@ -24,54 +24,64 @@
 
 static const struct data
 {
-  float poly[5];
+  float c0, c2, c4, c1, c3;
   float shift, thres;
 } data = {
-  /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
-     compatibility with polynomial helpers.  */
-  .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f,
-	    0x1.59977ap-10f },
+  /* Coefficients copied from the polynomial in AdvSIMD variant.  */
+  .c0 = 0x1.62e422p-1f,
+  .c1 = 0x1.ebf9bcp-3f,
+  .c2 = 0x1.c6bd32p-5f,
+  .c3 = 0x1.3ce9e4p-7f,
+  .c4 = 0x1.59977ap-10f,
   /* 1.5*2^17 + 127.  */
-  .shift = 0x1.903f8p17f,
+  .shift = 0x1.803f8p17f,
   /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
      correctly by FEXPA.  */
   .thres = Thres,
 };
 
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
-  return sv_call_f32 (exp2f, x, y, special);
-}
-
-/* Single-precision SVE exp2f routine. Implements the same algorithm
-   as AdvSIMD exp2f.
-   Worst case error is 1.04 ULPs.
-   SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
-				  want 0x1.ba7ebp+0.  */
-svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+static inline svfloat32_t
+sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
 {
-  const struct data *d = ptr_barrier (&data);
   /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
     x = n + r, with r in [-1/2, 1/2].  */
-  svfloat32_t shift = sv_f32 (d->shift);
-  svfloat32_t z = svadd_x (pg, x, shift);
-  svfloat32_t n = svsub_x (pg, z, shift);
-  svfloat32_t r = svsub_x (pg, x, n);
+  svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift);
+  svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift);
+  svfloat32_t r = svsub_x (svptrue_b32 (), x, n);
 
-  svbool_t special = svacgt (pg, x, d->thres);
   svfloat32_t scale = svexpa (svreinterpret_u32 (z));
 
   /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
      Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for
      coefficients 1 to 4, and apply most significant coefficient directly.  */
-  svfloat32_t r2 = svmul_x (pg, r, r);
-  svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1);
-  svfloat32_t p0 = svmul_x (pg, r, d->poly[0]);
+  svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+  svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
+  svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
+  svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
+  svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
   svfloat32_t poly = svmla_x (pg, p0, r2, p14);
 
-  if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svmla_x (pg, scale, scale, poly), special);
-
   return svmla_x (pg, scale, scale, poly);
 }
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+  return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d),
+		      special);
+}
+
+/* Single-precision SVE exp2f routine. Implements the same algorithm
+   as AdvSIMD exp2f.
+   Worst case error is 1.04 ULPs.
+   _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
+				 want 0x1.ba6a64p-1.  */
+svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+  svbool_t special = svacgt (pg, x, d->thres);
+  if (__glibc_unlikely (svptest_any (special, special)))
+    return special_case (x, special, d);
+  return sv_exp2f_inline (x, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
index 99d2e647aa..5c9cb72620 100644
--- a/sysdeps/aarch64/fpu/expf_advsimd.c
+++ b/sysdeps/aarch64/fpu/expf_advsimd.c
@@ -22,7 +22,7 @@
 static const struct data
 {
   float32x4_t poly[5];
-  float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
+  float32x4_t inv_ln2, ln2_hi, ln2_lo;
   uint32x4_t exponent_bias;
 #if !WANT_SIMD_EXCEPT
   float32x4_t special_bound, scale_thresh;
@@ -31,7 +31,6 @@ static const struct data
   /* maxerr: 1.45358 +0.5 ulp.  */
   .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
 	    V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
-  .shift = V4 (0x1.8p23f),
   .inv_ln2 = V4 (0x1.715476p+0f),
   .ln2_hi = V4 (0x1.62e4p-1f),
   .ln2_lo = V4 (0x1.7f7d1cp-20f),
@@ -85,7 +84,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
-  float32x4_t n, r, r2, scale, p, q, poly, z;
+  float32x4_t n, r, r2, scale, p, q, poly;
   uint32x4_t cmp, e;
 
 #if WANT_SIMD_EXCEPT
@@ -104,11 +103,10 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
 
   /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
      x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
-  z = vfmaq_f32 (d->shift, x, d->inv_ln2);
-  n = vsubq_f32 (z, d->shift);
+  n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
   r = vfmsq_f32 (x, n, d->ln2_hi);
   r = vfmsq_f32 (r, n, d->ln2_lo);
-  e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+  e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
   scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
 
 #if !WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
index 3ba79bc4f1..da93e01b87 100644
--- a/sysdeps/aarch64/fpu/expf_sve.c
+++ b/sysdeps/aarch64/fpu/expf_sve.c
@@ -18,33 +18,25 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
+#include "sv_expf_inline.h"
+
+/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+   correctly by FEXPA.  */
+#define Thres 0x1.5d5e2ap+6f
 
 static const struct data
 {
-  float poly[5];
-  float inv_ln2, ln2_hi, ln2_lo, shift, thres;
+  struct sv_expf_data d;
+  float thres;
 } data = {
-  /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
-     compatibility with polynomial helpers.  */
-  .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f,
-	    0x1.0e4020p-7f },
-  .inv_ln2 = 0x1.715476p+0f,
-  .ln2_hi = 0x1.62e4p-1f,
-  .ln2_lo = 0x1.7f7d1cp-20f,
-  /* 1.5*2^17 + 127.  */
-  .shift = 0x1.903f8p17f,
-  /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
-     correctly by FEXPA.  */
-  .thres = 0x1.5d5e2ap+6f,
+  .d = SV_EXPF_DATA,
+  .thres = Thres,
 };
 
-#define C(i) sv_f32 (d->poly[i])
-#define ExponentBias 0x3f800000
-
 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
 {
-  return sv_call_f32 (expf, x, y, special);
+  return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special);
 }
 
 /* Optimised single-precision SVE exp function.
@@ -54,36 +46,8 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
 svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
-
-  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
-     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
-
-  /* Load some constants in quad-word chunks to minimise memory access (last
-     lane is wasted).  */
-  svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2);
-
-  /* n = round(x/(ln2/N)).  */
-  svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0);
-  svfloat32_t n = svsub_x (pg, z, d->shift);
-
-  /* r = x - n*ln2/N.  */
-  svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1);
-  r = svmls_lane (r, n, invln2_and_ln2, 2);
-
-  /* scale = 2^(n/N).  */
   svbool_t is_special_case = svacgt (pg, x, d->thres);
-  svfloat32_t scale = svexpa (svreinterpret_u32 (z));
-
-  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6.  */
-  svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
-  svfloat32_t p34 = svmla_x (pg, C (3), C (4), r);
-  svfloat32_t r2 = svmul_x (pg, r, r);
-  svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
-  svfloat32_t p0 = svmul_x (pg, r, C (0));
-  svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
   if (__glibc_unlikely (svptest_any (pg, is_special_case)))
-    return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case);
-
-  return svmla_x (pg, scale, scale, poly);
+    return special_case (x, is_special_case, &d->d);
+  return expf_inline (x, pg, &d->d);
 }
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
index a0616ec754..8303ca296e 100644
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@@ -18,27 +18,18 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_expm1f_inline.h"
 
 static const struct data
 {
-  float32x4_t poly[5];
-  float invln2_and_ln2[4];
-  float32x4_t shift;
-  int32x4_t exponent_bias;
+  struct v_expm1f_data d;
 #if WANT_SIMD_EXCEPT
   uint32x4_t thresh;
 #else
   float32x4_t oflow_bound;
 #endif
 } data = {
-  /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2].  */
-  .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
-	    V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
-  /* Stores constants: invln2, ln2_hi, ln2_lo, 0.  */
-  .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
-  .shift = V4 (0x1.8p23f),
-  .exponent_bias = V4 (0x3f800000),
+  .d = V_EXPM1F_DATA,
 #if !WANT_SIMD_EXCEPT
   /* Value above which expm1f(x) should overflow. Absolute value of the
      underflow bound is greater than this, so it catches both cases - there is
@@ -55,67 +46,38 @@ static const struct data
 #define TinyBound v_u32 (0x34000000 << 1)
 
 static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
 {
-  return v_call_f32 (expm1f, x, y, special);
+  return v_call_f32 (
+      expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
 }
 
 /* Single-precision vector exp(x) - 1 function.
-   The maximum error is 1.51 ULP:
-   _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
-				  want 0x1.e2fb94p-2.  */
+   The maximum error is 1.62 ULP:
+   _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
+				want 0x1.da9f44p-2.  */
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
-  uint32x4_t ix = vreinterpretq_u32_f32 (x);
 
 #if WANT_SIMD_EXCEPT
+  uint32x4_t ix = vreinterpretq_u32_f32 (x);
   /* If fp exceptions are to be triggered correctly, fall back to scalar for
      |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
      shift-left by 1, and compare with thresh which was left-shifted offline -
      this is effectively an absolute compare.  */
   uint32x4_t special
       = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
-  if (__glibc_unlikely (v_any_u32 (special)))
-    x = v_zerofy_f32 (x, special);
 #else
   /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf.  */
   uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
 #endif
 
-  /* Reduce argument to smaller range:
-     Let i = round(x / ln2)
-     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
-     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
-     where 2^i is exact because i is an integer.  */
-  float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
-  float32x4_t j
-      = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
-  int32x4_t i = vcvtq_s32_f32 (j);
-  float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
-  f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
-
-  /* Approximate expm1(f) using polynomial.
-     Taylor expansion for expm1(x) has the form:
-	 x + ax^2 + bx^3 + cx^4 ....
-     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
-     and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
-  float32x4_t p = v_horner_4_f32 (f, d->poly);
-  p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
-
-  /* Assemble the result.
-     expm1(x) ~= 2^i * (p + 1) - 1
-     Let t = 2^i.  */
-  int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
-  float32x4_t t = vreinterpretq_f32_s32 (u);
-
   if (__glibc_unlikely (v_any_u32 (special)))
-    return special_case (vreinterpretq_f32_u32 (ix),
-			 vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
-			 special);
+    return special_case (x, special, d);
 
   /* expm1(x) ~= p * t + (t - 1).  */
-  return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+  return expm1f_inline (x, &d->d);
 }
 libmvec_hidden_def (V_NAME_F1 (expm1))
 HALF_WIDTH_ALIAS_F1 (expm1)
diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
index 9347422a77..82228b599a 100644
--- a/sysdeps/aarch64/fpu/log10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
@@ -22,11 +22,11 @@
 
 static const struct data
 {
-  uint32x4_t min_norm;
+  uint32x4_t off, offset_lower_bound;
   uint16x8_t special_bound;
+  uint32x4_t mantissa_mask;
   float32x4_t poly[8];
   float32x4_t inv_ln10, ln2;
-  uint32x4_t off, mantissa_mask;
 } data = {
   /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
       [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25.  */
@@ -35,18 +35,22 @@ static const struct data
 	    V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
   .ln2 = V4 (0x1.62e43p-1f),
   .inv_ln10 = V4 (0x1.bcb7b2p-2f),
-  .min_norm = V4 (0x00800000),
-  .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm.  */
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x0080000 - offset (which wraps around).  */
+  .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+  .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000).  */
   .off = V4 (0x3f2aaaab),	/* 0.666667.  */
   .mantissa_mask = V4 (0x007fffff),
 };
 
 static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
-	      uint16x4_t cmp)
+special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
+	      uint16x4_t cmp, const struct data *d)
 {
   /* Fall back to scalar code.  */
-  return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
+  return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+		     vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
 }
 
 /* Fast implementation of AdvSIMD log10f,
@@ -58,15 +62,21 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
-  uint32x4_t u = vreinterpretq_u32_f32 (x);
-  uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
-				 vget_low_u16 (d->special_bound));
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint32x4_t u_off = vreinterpretq_u32_f32 (x);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  u = vsubq_u32 (u, d->off);
+  u_off = vsubq_u32 (u_off, d->off);
   float32x4_t n = vcvtq_f32_s32 (
-      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend.  */
-  u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend.  */
+
+  uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+				 vget_low_u16 (d->special_bound));
+
+  uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
   float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
 
   /* y = log10(1+r) + n * log10(2).  */
@@ -77,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
   y = vmulq_f32 (y, d->inv_ln10);
 
   if (__glibc_unlikely (v_any_u16h (special)))
-    return special_case (x, y, poly, r2, special);
+    return special_case (y, u_off, poly, r2, special, d);
   return vfmaq_f32 (y, poly, r2);
 }
 libmvec_hidden_def (V_NAME_F1 (log10))
diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c
index bdbb49cd32..7913679f67 100644
--- a/sysdeps/aarch64/fpu/log10f_sve.c
+++ b/sysdeps/aarch64/fpu/log10f_sve.c
@@ -24,6 +24,7 @@ static const struct data
   float poly_0246[4];
   float poly_1357[4];
   float ln2, inv_ln10;
+  uint32_t off, lower;
 } data = {
   .poly_1357 = {
     /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -35,18 +36,23 @@ static const struct data
 		 -0x1.0fc92cp-4f },
   .ln2 = 0x1.62e43p-1f,
   .inv_ln10 = 0x1.bcb7b2p-2f,
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x0080000 - offset (which wraps around).  */
+  .lower = 0x00800000 - 0x3f2aaaab
 };
 
-#define Min 0x00800000
-#define Max 0x7f800000
-#define Thres 0x7f000000  /* Max - Min.  */
-#define Offset 0x3f2aaaab /* 0.666667.  */
+#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000.  */
 #define MantissaMask 0x007fffff
 
 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+	      svbool_t cmp)
 {
-  return sv_call_f32 (log10f, x, y, special);
+  return sv_call_f32 (
+      log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
 }
 
 /* Optimised implementation of SVE log10f using the same algorithm and
@@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
 svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
-  svuint32_t ix = svreinterpret_u32 (x);
-  svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
+
+  svuint32_t u_off = svreinterpret_u32 (x);
+
+  u_off = svsub_x (pg, u_off, d->off);
+  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  ix = svsub_x (pg, ix, Offset);
   svfloat32_t n = svcvt_f32_x (
-      pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend.  */
-  ix = svand_x (pg, ix, MantissaMask);
-  ix = svadd_x (pg, ix, Offset);
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend.  */
+  svuint32_t ix = svand_x (pg, u_off, MantissaMask);
+  ix = svadd_x (pg, ix, d->off);
   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);
 
   /* y = log10(1+r) + n*log10(2)
      log10(1+r) ~ r * InvLn(10) + P(r)
      where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
      log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3).  */
-  svfloat32_t r2 = svmul_x (pg, r, r);
-  svfloat32_t r4 = svmul_x (pg, r2, r2);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+  svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2);
   svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
   svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
   svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
@@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
   hi = svmul_x (pg, hi, d->inv_ln10);
 
   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
-			 special);
-  return svmla_x (pg, hi, r2, y);
+    return special_case (u_off, hi, r2, y, special);
+  return svmla_x (svptrue_b32 (), hi, r2, y);
 }
diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c
index ffc418fc9c..114064c696 100644
--- a/sysdeps/aarch64/fpu/log1p_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1p_advsimd.c
@@ -127,3 +127,5 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
 
   return vfmaq_f64 (y, f2, p);
 }
+
+strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1))
diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c
index 04f7e5720e..b21cfb2c90 100644
--- a/sysdeps/aarch64/fpu/log1p_sve.c
+++ b/sysdeps/aarch64/fpu/log1p_sve.c
@@ -116,3 +116,5 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
 
   return y;
 }
+
+strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1))
diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
index dc15334a85..00006fc703 100644
--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
@@ -18,113 +18,81 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_log1pf_inline.h"
+
+#if WANT_SIMD_EXCEPT
 
 const static struct data
 {
-  float32x4_t poly[8], ln2;
-  uint32x4_t tiny_bound, minus_one, four, thresh;
-  int32x4_t three_quarters;
+  uint32x4_t minus_one, thresh;
+  struct v_log1pf_data d;
 } data = {
-  .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
-	       (1, -0.5) are not stored as they can be generated more
-	       efficiently.  */
-	    V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
-	    V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
-	    V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
-  .ln2 = V4 (0x1.62e43p-1f),
-  .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23.  */
-  .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound.  */
+  .d = V_LOG1PF_CONSTANTS_TABLE,
+  .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound.  */
   .minus_one = V4 (0xbf800000),
-  .four = V4 (0x40800000),
-  .three_quarters = V4 (0x3f400000)
 };
 
-static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *p)
-{
-  /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme.  */
-  float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
-  float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
-  float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
-  float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
-
-  float32x4_t m2 = vmulq_f32 (m, m);
-  float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
-  float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
-  float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
-
-  float32x4_t m4 = vmulq_f32 (m2, m2);
-  float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
-  return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
-}
+/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23.  */
+#  define TinyBound v_u32 (0x34000000)
 
 static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
 {
-  return v_call_f32 (log1pf, x, y, special);
+  /* Side-step special lanes so fenv exceptions are not triggered
+     inadvertently.  */
+  float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
+  return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
 }
 
-/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
-   is roughly 2.02 ULP:
-   log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3.  */
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+   error is 1.69 ULP:
+   _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
+				 want 0x1.cfcbdcp-3.  */
 VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
-
   uint32x4_t ix = vreinterpretq_u32_f32 (x);
   uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
   uint32x4_t special_cases
-      = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
+      = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
 		   vcgeq_u32 (ix, d->minus_one));
-  float32x4_t special_arg = x;
 
-#if WANT_SIMD_EXCEPT
   if (__glibc_unlikely (v_any_u32 (special_cases)))
-    /* Side-step special lanes so fenv exceptions are not triggered
-       inadvertently.  */
-    x = v_zerofy_f32 (x, special_cases);
-#endif
+    return special_case (x, special_cases, d);
 
-  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
-			   is in [-0.25, 0.5]):
-     log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
-     We approximate log1p(m) with a polynomial, then scale by
-     k*log(2). Instead of doing this directly, we use an intermediate
-     scale factor s = 4*k*log(2) to ensure the scale is representable
-     as a normalised fp32 number.  */
+  return log1pf_inline (x, &d->d);
+}
 
-  float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+#else
 
-  /* Choose k to scale x to the range [-1/4, 1/2].  */
-  int32x4_t k
-      = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
-		   v_s32 (0xff800000));
-  uint32x4_t ku = vreinterpretq_u32_s32 (k);
+const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
 
-  /* Scale x by exponent manipulation.  */
-  float32x4_t m_scale
-      = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp)
+{
+  return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
+}
 
-  /* Scale up to ensure that the scale factor is representable as normalised
-     fp32 number, and scale m down accordingly.  */
-  float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
-  m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+   error is 1.63 ULP:
+   _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
+				 want 0x1.fdcb16p-3.  */
+VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
+{
+  uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
+					vcaleq_f32 (x, v_f32 (0x1p127f)));
 
-  /* Evaluate polynomial on the reduced interval.  */
-  float32x4_t p = eval_poly (m_scale, d->poly);
+  if (__glibc_unlikely (v_any_u32 (special_cases)))
+    return special_case (x, special_cases);
 
-  /* The scale factor to be applied back at the end - by multiplying float(k)
-     by 2^-23 we get the unbiased exponent of k.  */
-  float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
+  return log1pf_inline (x, ptr_barrier (&data));
+}
 
-  /* Apply the scaling back.  */
-  float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
+#endif
 
-  if (__glibc_unlikely (v_any_u32 (special_cases)))
-    return special_case (special_arg, y, special_cases);
-  return y;
-}
 libmvec_hidden_def (V_NAME_F1 (log1p))
 HALF_WIDTH_ALIAS_F1 (log1p)
+strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1))
+libmvec_hidden_def (V_NAME_F1 (logp1))
+HALF_WIDTH_ALIAS_F1 (logp1)
diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
index f645cc997e..5256d5e94c 100644
--- a/sysdeps/aarch64/fpu/log1pf_sve.c
+++ b/sysdeps/aarch64/fpu/log1pf_sve.c
@@ -98,3 +98,5 @@ svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
 
   return y;
 }
+
+strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1))
diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
index db21836749..84effe4fe9 100644
--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
@@ -22,9 +22,9 @@
 
 static const struct data
 {
-  uint32x4_t min_norm;
+  uint32x4_t off, offset_lower_bound;
   uint16x8_t special_bound;
-  uint32x4_t off, mantissa_mask;
+  uint32x4_t mantissa_mask;
   float32x4_t poly[9];
 } data = {
   /* Coefficients generated using Remez algorithm approximate
@@ -34,18 +34,22 @@ static const struct data
 	    V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
 	    V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
 	    V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
-  .min_norm = V4 (0x00800000),
-  .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm.  */
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x0080000 - offset (which wraps around).  */
+  .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+  .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000).  */
   .off = V4 (0x3f2aaaab),	/* 0.666667.  */
   .mantissa_mask = V4 (0x007fffff),
 };
 
 static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
-	      uint16x4_t cmp)
+special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
+	      uint16x4_t cmp, const struct data *d)
 {
   /* Fall back to scalar code.  */
-  return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
+  return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+		     vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
 }
 
 /* Fast implementation for single precision AdvSIMD log2,
@@ -56,15 +60,21 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
-  uint32x4_t u = vreinterpretq_u32_f32 (x);
-  uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
-				 vget_low_u16 (d->special_bound));
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint32x4_t u_off = vreinterpretq_u32_f32 (x);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  u = vsubq_u32 (u, d->off);
+  u_off = vsubq_u32 (u_off, d->off);
   float32x4_t n = vcvtq_f32_s32 (
-      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend.  */
-  u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend.  */
+
+  uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+				 vget_low_u16 (d->special_bound));
+
+  uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
   float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
 
   /* y = log2(1+r) + n.  */
@@ -72,7 +82,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
   float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
 
   if (__glibc_unlikely (v_any_u16h (special)))
-    return special_case (x, n, p, r, special);
+    return special_case (n, u_off, p, r, special, d);
   return vfmaq_f32 (n, p, r);
 }
 libmvec_hidden_def (V_NAME_F1 (log2))
diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c
index 5031c42483..939d89bfb9 100644
--- a/sysdeps/aarch64/fpu/log2f_sve.c
+++ b/sysdeps/aarch64/fpu/log2f_sve.c
@@ -23,6 +23,7 @@ static const struct data
 {
   float poly_02468[5];
   float poly_1357[4];
+  uint32_t off, lower;
 } data = {
   .poly_1357 = {
     /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -32,18 +33,23 @@ static const struct data
   },
   .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
 		  0x1.9d8ecap-3f, 0x1.9e495p-3f },
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x0080000 - offset (which wraps around).  */
+  .lower = 0x00800000 - 0x3f2aaaab
 };
 
-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thres (0x7f000000) /* Max - Min.  */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000.  */
 #define MantissaMask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667.  */
 
 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+	      svbool_t cmp)
 {
-  return sv_call_f32 (log2f, x, y, cmp);
+  return sv_call_f32 (
+      log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
 }
 
 /* Optimised implementation of SVE log2f, using the same algorithm
@@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
 
-  svuint32_t u = svreinterpret_u32 (x);
-  svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres);
+  svuint32_t u_off = svreinterpret_u32 (x);
+
+  u_off = svsub_x (pg, u_off, d->off);
+  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  u = svsub_x (pg, u, Off);
   svfloat32_t n = svcvt_f32_x (
-      pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend.  */
-  u = svand_x (pg, u, MantissaMask);
-  u = svadd_x (pg, u, Off);
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend.  */
+  svuint32_t u = svand_x (pg, u_off, MantissaMask);
+  u = svadd_x (pg, u, d->off);
   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
 
   /* y = log2(1+r) + n.  */
-  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
 
   /* Evaluate polynomial using pairwise Horner scheme.  */
   svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
@@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
   y = svmla_x (pg, q_01, r2, y);
 
   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special);
-  return svmla_x (pg, n, r, y);
+    return special_case (u_off, n, r, y, special);
+  return svmla_x (svptrue_b32 (), n, r, y);
 }
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
index 3c0d0fcdc7..c20dbfd6c0 100644
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
@@ -21,20 +21,22 @@
 
 static const struct data
 {
-  uint32x4_t min_norm;
+  uint32x4_t off, offset_lower_bound;
   uint16x8_t special_bound;
+  uint32x4_t mantissa_mask;
   float32x4_t poly[7];
-  float32x4_t ln2, tiny_bound;
-  uint32x4_t off, mantissa_mask;
+  float32x4_t ln2;
 } data = {
   /* 3.34 ulp error.  */
   .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
 	    V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
 	    V4 (-0x1.ffffc8p-2f) },
   .ln2 = V4 (0x1.62e43p-1f),
-  .tiny_bound = V4 (0x1p-126),
-  .min_norm = V4 (0x00800000),
-  .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm.  */
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x0080000 - offset (which wraps around).  */
+  .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+  .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000).  */
   .off = V4 (0x3f2aaaab),	/* 0.666667.  */
   .mantissa_mask = V4 (0x007fffff)
 };
@@ -42,32 +44,37 @@ static const struct data
 #define P(i) d->poly[7 - i]
 
 static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
-	      uint16x4_t cmp)
+special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
+	      uint16x4_t cmp, const struct data *d)
 {
   /* Fall back to scalar code.  */
-  return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+  return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+		     vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
 }
 
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   float32x4_t n, p, q, r, r2, y;
-  uint32x4_t u;
+  uint32x4_t u, u_off;
   uint16x4_t cmp;
 
-  u = vreinterpretq_u32_f32 (x);
-  cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
-		  vget_low_u16 (d->special_bound));
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  u_off = vreinterpretq_u32_f32 (x);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  u = vsubq_u32 (u, d->off);
+  u_off = vsubq_u32 (u_off, d->off);
   n = vcvtq_f32_s32 (
-      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend.  */
-  u = vandq_u32 (u, d->mantissa_mask);
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend.  */
+  u = vandq_u32 (u_off, d->mantissa_mask);
   u = vaddq_u32 (u, d->off);
   r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
 
+  cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+		  vget_low_u16 (d->special_bound));
+
   /* y = log(1+r) + n*ln2.  */
   r2 = vmulq_f32 (r, r);
   /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))).  */
@@ -80,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
   p = vfmaq_f32 (r, d->ln2, n);
 
   if (__glibc_unlikely (v_any_u16h (cmp)))
-    return special_case (x, y, r2, p, cmp);
+    return special_case (p, u_off, y, r2, cmp, d);
   return vfmaq_f32 (p, y, r2);
 }
 libmvec_hidden_def (V_NAME_F1 (log))
diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
index d64e810cfe..5b9324678d 100644
--- a/sysdeps/aarch64/fpu/logf_sve.c
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@@ -24,6 +24,7 @@ static const struct data
   float poly_0135[4];
   float poly_246[3];
   float ln2;
+  uint32_t off, lower;
 } data = {
   .poly_0135 = {
     /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
@@ -32,19 +33,24 @@ static const struct data
     -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
   },
   .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
-  .ln2 = 0x1.62e43p-1f
+  .ln2 = 0x1.62e43p-1f,
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x0080000 - offset (which wraps around).  */
+  .lower = 0x00800000 - 0x3f2aaaab
 };
 
-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thresh (0x7f000000) /* Max - Min.  */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000.  */
 #define Mask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667.  */
 
 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+	      svbool_t cmp)
 {
-  return sv_call_f32 (logf, x, y, cmp);
+  return sv_call_f32 (
+      logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
 }
 
 /* Optimised implementation of SVE logf, using the same algorithm and
@@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
 
-  svuint32_t u = svreinterpret_u32 (x);
-  svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
+  svuint32_t u_off = svreinterpret_u32 (x);
+
+  u_off = svsub_x (pg, u_off, d->off);
+  svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  u = svsub_x (pg, u, Off);
   svfloat32_t n = svcvt_f32_x (
-      pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend.  */
-  u = svand_x (pg, u, Mask);
-  u = svadd_x (pg, u, Off);
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend.  */
+
+  svuint32_t u = svand_x (pg, u_off, Mask);
+  u = svadd_x (pg, u, d->off);
   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
 
   /* y = log(1+r) + n*ln2.  */
-  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
   /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))).  */
   svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
   svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
@@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
   p = svmla_x (pg, r, n, d->ln2);
 
   if (__glibc_unlikely (svptest_any (pg, cmp)))
-    return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+    return special_case (u_off, p, r2, y, cmp);
   return svmla_x (pg, p, r2, y);
 }
diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c
index a0d9d3b819..718125cbad 100644
--- a/sysdeps/aarch64/fpu/sin_advsimd.c
+++ b/sysdeps/aarch64/fpu/sin_advsimd.c
@@ -22,7 +22,7 @@
 static const struct data
 {
   float64x2_t poly[7];
-  float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+  float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
 } data = {
   .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
 	    V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
@@ -34,12 +34,13 @@ static const struct data
   .pi_1 = V2 (0x1.921fb54442d18p+1),
   .pi_2 = V2 (0x1.1a62633145c06p-53),
   .pi_3 = V2 (0x1.c1cd129024e09p-106),
-  .shift = V2 (0x1.8p52),
 };
 
 #if WANT_SIMD_EXCEPT
-# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255).  */
-# define Thresh v_u64 (0x1160000000000000)    /* RangeVal - TinyBound.  */
+/* asuint64(0x1p-253)), below which multiply by inv_pi underflows.  */
+# define TinyBound v_u64 (0x3020000000000000)
+/* RangeVal - TinyBound.  */
+# define Thresh v_u64 (0x1160000000000000)
 #endif
 
 #define C(i) d->poly[i]
@@ -72,16 +73,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
      fenv). These lanes will be fixed by special-case handler later.  */
   uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
   cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
-  r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
+  r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
 #else
   r = x;
   cmp = vcageq_f64 (x, d->range_val);
 #endif
 
   /* n = rint(|x|/pi).  */
-  n = vfmaq_f64 (d->shift, d->inv_pi, r);
-  odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
-  n = vsubq_f64 (n, d->shift);
+  n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
+  odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
 
   /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
   r = vfmsq_f64 (r, d->pi_1, n);
diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
index 375dfc3331..6ee9a23d5b 100644
--- a/sysdeps/aarch64/fpu/sinf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
@@ -22,7 +22,7 @@
 static const struct data
 {
   float32x4_t poly[4];
-  float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+  float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
 } data = {
   /* 1.886 ulp error.  */
   .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@@ -33,13 +33,14 @@ static const struct data
   .pi_3 = V4 (-0x1.ee59dap-49f),
 
   .inv_pi = V4 (0x1.45f306p-2f),
-  .shift = V4 (0x1.8p+23f),
   .range_val = V4 (0x1p20f)
 };
 
 #if WANT_SIMD_EXCEPT
-# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f).  */
-# define Thresh v_u32 (0x28800000)    /* RangeVal - TinyBound.  */
+/* asuint32(0x1p-59f), below which multiply by inv_pi underflows.  */
+# define TinyBound v_u32 (0x22000000)
+/* RangeVal - TinyBound.  */
+# define Thresh v_u32 (0x27800000)
 #endif
 
 #define C(i) d->poly[i]
@@ -64,23 +65,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
   /* If fenv exceptions are to be triggered correctly, set any special lanes
      to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
      special-case handler later.  */
-  r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
+  r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
 #else
   r = x;
   cmp = vcageq_f32 (x, d->range_val);
 #endif
 
-  /* n = rint(|x|/pi) */
-  n = vfmaq_f32 (d->shift, d->inv_pi, r);
-  odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
-  n = vsubq_f32 (n, d->shift);
+  /* n = rint(|x|/pi).  */
+  n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
+  odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
 
-  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2) */
+  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
   r = vfmsq_f32 (r, d->pi_1, n);
   r = vfmsq_f32 (r, d->pi_2, n);
   r = vfmsq_f32 (r, d->pi_3, n);
 
-  /* y = sin(r) */
+  /* y = sin(r).  */
   r2 = vmulq_f32 (r, r);
   y = vfmaq_f32 (C (2), C (3), r2);
   y = vfmaq_f32 (C (1), y, r2);
diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c
index 6bb7482dc2..c6ed7598e7 100644
--- a/sysdeps/aarch64/fpu/sinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
@@ -23,15 +23,13 @@
 static const struct data
 {
   struct v_expm1f_data expm1f_consts;
-  uint32x4_t halff;
 #if WANT_SIMD_EXCEPT
   uint32x4_t tiny_bound, thresh;
 #else
-  uint32x4_t oflow_bound;
+  float32x4_t oflow_bound;
 #endif
 } data = {
   .expm1f_consts = V_EXPM1F_DATA,
-  .halff = V4 (0x3f000000),
 #if WANT_SIMD_EXCEPT
   /* 0x1.6a09e8p-32, below which expm1f underflows.  */
   .tiny_bound = V4 (0x2fb504f4),
@@ -39,14 +37,15 @@ static const struct data
   .thresh = V4 (0x12fbbbb3),
 #else
   /* 0x1.61814ep+6, above which expm1f helper overflows.  */
-  .oflow_bound = V4 (0x42b0c0a7),
+  .oflow_bound = V4 (0x1.61814ep+6),
 #endif
 };
 
 static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+	      uint32x4_t special)
 {
-  return v_call_f32 (sinhf, x, y, special);
+  return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
 }
 
 /* Approximation for vector single-precision sinh(x) using expm1.
@@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
 
   uint32x4_t ix = vreinterpretq_u32_f32 (x);
   float32x4_t ax = vabsq_f32 (x);
-  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
-  uint32x4_t sign = veorq_u32 (ix, iax);
-  float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+  float32x4_t halfsign = vreinterpretq_f32_u32 (
+      vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
 
 #if WANT_SIMD_EXCEPT
-  uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+  uint32x4_t special = vcgeq_u32 (
+      vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
   ax = v_zerofy_f32 (ax, special);
 #else
-  uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+  uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
 #endif
 
   /* Up to the point that expm1f overflows, we can use it to calculate sinhf
@@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
   /* Fall back to the scalar variant for any lanes that should trigger an
      exception.  */
   if (__glibc_unlikely (v_any_u32 (special)))
-    return special_case (x, vmulq_f32 (t, halfsign), special);
+    return special_case (x, t, halfsign, special);
 
   return vmulq_f32 (t, halfsign);
 }
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index 23963b5f8e..6166df6553 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -24,19 +24,20 @@
 
 struct sv_expf_data
 {
-  float poly[5];
-  float inv_ln2, ln2_hi, ln2_lo, shift;
+  float c1, c3, inv_ln2;
+  float ln2_lo, c0, c2, c4;
+  float ln2_hi, shift;
 };
 
 /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
    compatibility with polynomial helpers. Shift is 1.5*2^17 + 127.  */
 #define SV_EXPF_DATA                                                          \
   {                                                                           \
-    .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
-	      0x1.0e4020p-7f },                                               \
-                                                                              \
-    .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,                        \
-    .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f,                        \
+    /* Coefficients copied from the polynomial in AdvSIMD variant.  */        \
+    .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f,         \
+    .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f,    \
+    .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f,                        \
+    .shift = 0x1.803f8p17f,                                                   \
   }
 
 #define C(i) sv_f32 (d->poly[i])
@@ -47,26 +48,25 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
   /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
      x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
 
-  /* Load some constants in quad-word chunks to minimise memory access.  */
-  svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
+  svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);
 
   /* n = round(x/(ln2/N)).  */
-  svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
+  svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
   svfloat32_t n = svsub_x (pg, z, d->shift);
 
   /* r = x - n*ln2/N.  */
-  svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
-  r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
+  svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
+  r = svmls_lane (r, n, lane_consts, 0);
 
   /* scale = 2^(n/N).  */
-  svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
+  svfloat32_t scale = svexpa (svreinterpret_u32 (z));
 
   /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6.  */
-  svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
-  svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
-  svfloat32_t r2 = svmul_f32_x (pg, r, r);
+  svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+  svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
   svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
-  svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
+  svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
   svfloat32_t poly = svmla_x (pg, p0, r2, p14);
 
   return svmla_x (pg, scale, scale, poly);
diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c
index 50defd6ef0..3ced9b7a41 100644
--- a/sysdeps/aarch64/fpu/tanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
@@ -28,13 +28,16 @@ static const struct data
   /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for  negative).  */
   .boring_bound = V4 (0x41102cb3),
   .large_bound = V4 (0x7f800000),
-  .onef = V4 (0x3f800000),
 };
 
 static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+	      float32x4_t q, uint32x4_t special)
 {
-  return v_call_f32 (tanhf, x, y, special);
+  return v_call_f32 (
+      tanhf, x,
+      vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+      special);
 }
 
 /* Approximation for single-precision vector tanh(x), using a simplified
@@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
   uint32x4_t iax = vreinterpretq_u32_f32 (ax);
   uint32x4_t sign = veorq_u32 (ix, iax);
   uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
-  float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+  /* expm1 exponent bias is 1.0f reinterpreted to int.  */
+  float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+      sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
 
 #if WANT_SIMD_EXCEPT
   /* If fp exceptions are to be triggered properly, set all special and boring
@@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
 
   /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
   float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
-  float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+
   if (__glibc_unlikely (v_any_u32 (special)))
-    return special_case (vreinterpretq_f32_u32 (ix),
-			 vbslq_f32 (is_boring, boring, y), special);
+    return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+			 special);
+
+  float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
   return vbslq_f32 (is_boring, boring, y);
 }
 libmvec_hidden_def (V_NAME_F1 (tanh))
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 59b552da6b..1daedfdd51 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -21,48 +21,47 @@
 #define AARCH64_FPU_V_EXPM1F_INLINE_H
 
 #include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "math_config.h"
 
 struct v_expm1f_data
 {
-  float32x4_t poly[5];
-  float invln2_and_ln2[4];
-  float32x4_t shift;
+  float32x4_t c0, c2;
   int32x4_t exponent_bias;
+  float c1, c3, inv_ln2, c4;
+  float ln2_hi, ln2_lo;
 };
 
 /* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
-   log(2)/2]. Exponent bias is asuint(1.0f).
-   invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0.  */
+   log(2)/2]. Exponent bias is asuint(1.0f).  */
 #define V_EXPM1F_DATA                                                         \
   {                                                                           \
-    .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),     \
-	      V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },                      \
-    .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000),                \
-    .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },   \
+    .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5),  \
+    .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10,                                \
+    .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f,              \
+    .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f,                        \
   }
 
 static inline float32x4_t
 expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
 {
-  /* Helper routine for calculating exp(x) - 1.
-     Copied from v_expm1f_1u6.c, with all special-case handling removed - the
-     calling routine should handle special values if required.  */
+  /* Helper routine for calculating exp(x) - 1.  */
+
+  float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+  float32x4_t lane_consts = vld1q_f32 (&d->c1);
 
   /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
-  float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
-  float32x4_t j
-      = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
+  float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
   int32x4_t i = vcvtq_s32_f32 (j);
-  float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
-  f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
+  float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
+  f = vfmsq_lane_f32 (f, j, ln2, 1);
 
-  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
-     Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
-     Horner.  */
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).  */
   float32x4_t f2 = vmulq_f32 (f, f);
   float32x4_t f4 = vmulq_f32 (f2, f2);
-  float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
+  float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+  float32x4_t p = vfmaq_f32 (p01, f2, p23);
+  p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
   p = vfmaq_f32 (f, f2, p);
 
   /* t = 2^i.  */
diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h
index 643a6cdcfc..73e45a942e 100644
--- a/sysdeps/aarch64/fpu/v_log1pf_inline.h
+++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h
@@ -25,54 +25,81 @@
 
 struct v_log1pf_data
 {
-  float32x4_t poly[8], ln2;
   uint32x4_t four;
   int32x4_t three_quarters;
+  float c0, c3, c5, c7;
+  float32x4_t c4, c6, c1, c2, ln2;
 };
 
 /* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
    (1, -0.5) are not stored as they can be generated more efficiently.  */
 #define V_LOG1PF_CONSTANTS_TABLE                                              \
   {                                                                           \
-    .poly                                                                     \
-	= { V4 (0x1.5555aap-2f),  V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),  \
-	    V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f),	V4 (-0x1.0da91p-3f),  \
-	    V4 (0x1.abcb6p-4f),	  V4 (-0x1.6f0d5ep-5f) },                     \
-	.ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000),                   \
-	.three_quarters = V4 (0x3f400000)                                     \
+    .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f),                         \
+    .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f,                         \
+    .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f,                          \
+    .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f,                          \
+    .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000),                       \
+    .three_quarters = V4 (0x3f400000)                                         \
   }
 
 static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *c)
+eval_poly (float32x4_t m, const struct v_log1pf_data *d)
 {
-  /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
-     uses split Estrin, but this way reduces register pressure in the calling
-     routine).  */
-  float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
+  /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner.  */
+  float32x4_t c0357 = vld1q_f32 (&d->c0);
+  float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
   float32x4_t m2 = vmulq_f32 (m, m);
-  q = vfmaq_f32 (m, m2, q);
-  float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
+  float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
+  float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
+  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
+  float32x4_t p = vfmaq_f32 (p45, m2, p67);
+  p = vfmaq_f32 (p23, m2, p);
+  p = vfmaq_f32 (d->c1, m, p);
   p = vmulq_f32 (m2, p);
-  return vfmaq_f32 (q, m2, p);
+  p = vfmaq_f32 (m, m2, p);
+  return vfmaq_f32 (p, m2, q);
 }
 
 static inline float32x4_t
-log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
+log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
 {
-  /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
-     special-case handling. See that file for details of the algorithm.  */
+  /* Helper for calculating log(x + 1).  */
+
+  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+			   is in [-0.25, 0.5]):
+     log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+     We approximate log1p(m) with a polynomial, then scale by
+     k*log(2). Instead of doing this directly, we use an intermediate
+     scale factor s = 4*k*log(2) to ensure the scale is representable
+     as a normalised fp32 number.  */
   float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+  /* Choose k to scale x to the range [-1/4, 1/2].  */
   int32x4_t k
-      = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
+      = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
 		   v_s32 (0xff800000));
   uint32x4_t ku = vreinterpretq_u32_s32 (k);
-  float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
+
+  /* Scale up to ensure that the scale factor is representable as normalised
+     fp32 number, and scale m down accordingly.  */
+  float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+
+  /* Scale x by exponent manipulation.  */
   float32x4_t m_scale
       = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
   m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
-  float32x4_t p = eval_poly (m_scale, d.poly);
+
+  /* Evaluate polynomial on the reduced interval.  */
+  float32x4_t p = eval_poly (m_scale, d);
+
+  /* The scale factor to be applied back at the end - by multiplying float(k)
+     by 2^-23 we get the unbiased exponent of k.  */
   float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
-  return vfmaq_f32 (p, scale_back, d.ln2);
+
+  /* Apply the scaling back.  */
+  return vfmaq_f32 (p, scale_back, d->ln2);
 }
 
 #endif
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 6c96304611..846fb2c29e 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1082,24 +1082,24 @@ float: 1
 ldouble: 3
 
 Function: "exp10m1":
-double: 2
-float: 1
-ldouble: 1
+double: 4
+float: 2
+ldouble: 3
 
 Function: "exp10m1_downward":
-double: 1
-float: 1
-ldouble: 3
+double: 3
+float: 3
+ldouble: 6
 
 Function: "exp10m1_towardzero":
-double: 1
-float: 1
-ldouble: 3
+double: 2
+float: 3
+ldouble: 6
 
 Function: "exp10m1_upward":
-double: 3
-float: 1
-ldouble: 3
+double: 5
+float: 3
+ldouble: 6
 
 Function: "exp2":
 double: 1
@@ -1130,24 +1130,24 @@ float: 1
 ldouble: 2
 
 Function: "exp2m1":
-double: 1
-float: 1
-ldouble: 1
+double: 2
+float: 2
+ldouble: 2
 
 Function: "exp2m1_downward":
-double: 1
-float: 1
-ldouble: 2
+double: 3
+float: 3
+ldouble: 3
 
 Function: "exp2m1_towardzero":
-double: 2
-float: 1
-ldouble: 2
+double: 3
+float: 2
+ldouble: 4
 
 Function: "exp2m1_upward":
-double: 1
-float: 1
-ldouble: 2
+double: 3
+float: 3
+ldouble: 5
 
 Function: "exp_advsimd":
 double: 1
@@ -1356,24 +1356,24 @@ float: 2
 ldouble: 1
 
 Function: "log10p1":
-double: 1
-float: 1
+double: 2
+float: 2
 ldouble: 3
 
 Function: "log10p1_downward":
 double: 2
-float: 1
-ldouble: 2
+float: 3
+ldouble: 4
 
 Function: "log10p1_towardzero":
-double: 2
+double: 3
 float: 2
-ldouble: 2
+ldouble: 3
 
 Function: "log10p1_upward":
 double: 2
-float: 1
-ldouble: 3
+float: 3
+ldouble: 4
 
 Function: "log1p":
 double: 1
@@ -1432,8 +1432,8 @@ float: 3
 ldouble: 1
 
 Function: "log2p1":
-double: 1
-float: 1
+double: 2
+float: 2
 ldouble: 3
 
 Function: "log2p1_downward":
@@ -1447,9 +1447,9 @@ float: 2
 ldouble: 2
 
 Function: "log2p1_upward":
-double: 1
+double: 2
 float: 2
-ldouble: 2
+ldouble: 3
 
 Function: "log_advsimd":
 double: 1
diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
deleted file mode 100644
index 6c7f60b37e..0000000000
--- a/sysdeps/aarch64/memset-reg.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Register aliases for memset to be used across implementations.
-   Copyright (C) 2017-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#define dstin	x0
-#define val	x1
-#define valw	w1
-#define count	x2
-#define dst	x3
-#define dstend	x4
-#define tmp1	x5
-#define tmp1w	w5
-#define tmp2	x6
-#define tmp2w	w6
-#define zva_len x7
-#define zva_lenw w7
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 7ef77ee8c9..b76dde1557 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Generic optimized memset using SIMD.
+   Copyright (C) 2012-2024 Free Software Foundation, Inc.
 
    This file is part of the GNU C Library.
 
@@ -17,7 +18,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include "memset-reg.h"
 
 #ifndef MEMSET
 # define MEMSET memset
@@ -25,130 +25,131 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  *
  */
 
-ENTRY (MEMSET)
+#define dstin	x0
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define zva_val	x5
+#define off	x3
+#define dstend2	x5
 
+ENTRY (MEMSET)
 	PTR_ARG (0)
 	SIZE_ARG (2)
 
 	dup	v0.16B, valw
+	cmp	count, 16
+	b.lo	L(set_small)
+
 	add	dstend, dstin, count
+	cmp	count, 64
+	b.hs	L(set_128)
 
-	cmp	count, 96
-	b.hi	L(set_long)
-	cmp	count, 16
-	b.hs	L(set_medium)
-	mov	val, v0.D[0]
+	/* Set 16..63 bytes.  */
+	mov	off, 16
+	and	off, off, count, lsr 1
+	sub	dstend2, dstend, off
+	str	q0, [dstin]
+	str	q0, [dstin, off]
+	str	q0, [dstend2, -16]
+	str	q0, [dstend, -16]
+	ret
 
+	.p2align 4
 	/* Set 0..15 bytes.  */
-	tbz	count, 3, 1f
-	str	val, [dstin]
-	str	val, [dstend, -8]
-	ret
-	nop
-1:	tbz	count, 2, 2f
-	str	valw, [dstin]
-	str	valw, [dstend, -4]
+L(set_small):
+	add	dstend, dstin, count
+	cmp	count, 4
+	b.lo	2f
+	lsr	off, count, 3
+	sub	dstend2, dstend, off, lsl 2
+	str	s0, [dstin]
+	str	s0, [dstin, off, lsl 2]
+	str	s0, [dstend2, -4]
+	str	s0, [dstend, -4]
 	ret
+
+	/* Set 0..3 bytes.  */
 2:	cbz	count, 3f
+	lsr	off, count, 1
 	strb	valw, [dstin]
-	tbz	count, 1, 3f
-	strh	valw, [dstend, -2]
+	strb	valw, [dstin, off]
+	strb	valw, [dstend, -1]
 3:	ret
 
-	/* Set 17..96 bytes.  */
-L(set_medium):
-	str	q0, [dstin]
-	tbnz	count, 6, L(set96)
-	str	q0, [dstend, -16]
-	tbz	count, 5, 1f
-	str	q0, [dstin, 16]
-	str	q0, [dstend, -32]
-1:	ret
-
 	.p2align 4
-	/* Set 64..96 bytes.  Write 64 bytes from the start and
-	   32 bytes from the end.  */
-L(set96):
-	str	q0, [dstin, 16]
+L(set_128):
+	bic	dst, dstin, 15
+	cmp	count, 128
+	b.hi	L(set_long)
+	stp	q0, q0, [dstin]
 	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
-	.p2align 3
-	nop
+	.p2align 4
 L(set_long):
-	and	valw, valw, 255
-	bic	dst, dstin, 15
 	str	q0, [dstin]
-	cmp	count, 256
-	ccmp	valw, 0, 0, cs
-	b.eq	L(try_zva)
-L(no_zva):
-	sub	count, dstend, dst	/* Count is 16 too large.  */
-	sub	dst, dst, 16		/* Dst is biased by -32.  */
-	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]!
-L(tail64):
-	subs	count, count, 64
-	b.hi	1b
-2:	stp	q0, q0, [dstend, -64]
+	str	q0, [dst, 16]
+	tst	valw, 255
+	b.ne	L(no_zva)
+#ifndef ZVA64_ONLY
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(zva_128)
+#endif
+	stp	q0, q0, [dst, 32]
+	bic	dst, dstin, 63
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */
+
+	/* Write last bytes before ZVA loop.  */
+	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
+
+	.p2align 4
+L(zva64_loop):
+	add	dst, dst, 64
+	dc	zva, dst
+	subs	count, count, 64
+	b.hi	L(zva64_loop)
 	ret
 
-L(try_zva):
-#ifndef ZVA64_ONLY
 	.p2align 3
-	mrs	tmp1, dczid_el0
-	tbnz	tmp1w, 4, L(no_zva)
-	and	tmp1w, tmp1w, 15
-	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
-	b.ne	 L(zva_128)
-	nop
-#endif
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.
-	 */
-	.p2align 4
-L(zva_64):
-	str	q0, [dst, 16]
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 32 too large.  */
+	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
+L(no_zva_loop):
 	stp	q0, q0, [dst, 32]
-	bic	dst, dst, 63
 	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
 	add	dst, dst, 64
 	subs	count, count, 64
-	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
+	b.hi	L(no_zva_loop)
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
 #ifndef ZVA64_ONLY
-	.p2align 3
+	.p2align 4
 L(zva_128):
-	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
-	b.ne	L(zva_other)
+	cmp	zva_val, 5		/* ZVA size is 128 bytes.  */
+	b.ne	L(no_zva)
 
-	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
 	stp	q0, q0, [dst, 64]
 	stp	q0, q0, [dst, 96]
 	bic	dst, dst, 127
 	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
-	add	dst, dst, 128
+	sub	count, count, 128 + 128	/* Adjust count and bias for loop.  */
+1:	add	dst, dst, 128
+	dc	zva, dst
 	subs	count, count, 128
 	b.hi	1b
 	stp	q0, q0, [dstend, -128]
@@ -156,35 +157,6 @@ L(zva_128):
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
-
-L(zva_other):
-	mov	tmp2w, 4
-	lsl	zva_lenw, tmp2w, tmp1w
-	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
-	cmp	count, tmp1
-	blo	L(no_zva)
-
-	sub	tmp2, zva_len, 1
-	add	tmp1, dst, zva_len
-	add	dst, dst, 16
-	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
-	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
-	beq	2f
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
-	subs	count, count, 64
-	b.hi	1b
-2:	mov	dst, tmp1
-	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
-	subs	count, count, zva_len
-	b.lo	4f
-3:	dc	zva, dst
-	add	dst, dst, zva_len
-	subs	count, count, zva_len
-	b.hs	3b
-4:	add	count, count, zva_len
-	sub	dst, dst, 32		/* Bias dst for tail loop.  */
-	b	L(tail64)
 #endif
 
 END (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index 2e6d882fc9..f665b5a891 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -18,7 +18,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <sysdeps/aarch64/memset-reg.h>
 
 /* Assumptions:
  *
@@ -36,6 +35,14 @@
 
 	.arch armv8.2-a+sve
 
+#define dstin   x0
+#define valw    w1
+#define count   x2
+#define dst     x3
+#define dstend  x4
+#define tmp1    x5
+#define tmp2    x6
+
 	.macro st1b_unroll first=0, last=7
 	st1b	z0.b, p0, [dst, \first, mul vl]
 	.if \last-\first
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 6d714ed0e1..cf1b25f2ed 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -18,7 +18,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include "memset-reg.h"
 
 /* Assumptions:
  *
@@ -26,6 +25,13 @@
  *
  */
 
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+
 ENTRY (__memset_emag)
 
 	PTR_ARG (0)
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
index 7b21550137..f815c20b03 100644
--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -18,7 +18,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <sysdeps/aarch64/memset-reg.h>
 
 /* Assumptions:
  *
@@ -26,6 +25,12 @@
  *
  */
 
+#define dstin	x0
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+
 ENTRY (__memset_kunpeng)
 
 	PTR_ARG (0)
diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
index b43a43b54e..6fa28a9bd0 100644
--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
@@ -19,12 +19,18 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include "memset-reg.h"
 
 /* Assumptions:
    ARMv8-a, AArch64, unaligned accesses
  */
 
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+
 ENTRY (__memset_oryon1)
 
 	PTR_ARG (0)
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index ab2a576cdb..352fb40d3a 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Generic optimized strlen using SIMD.
+   Copyright (C) 2012-2024 Free Software Foundation, Inc.
 
    This file is part of the GNU C Library.
 
@@ -56,36 +57,50 @@ ENTRY (STRLEN)
 	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
-	cbz	synd, L(loop)
+	cbz	synd, L(next16)
 
 	rbit	synd, synd
 	clz	result, synd
 	lsr	result, result, 2
 	ret
 
+L(next16):
+	ldr	data, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
+	fmov	synd, dend
+	cbz	synd, L(loop)
+	add	src, src, 16
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	sub	result, src, srcin
+	clz	tmp, synd
+	add	result, result, tmp, lsr 2
+	ret
+
 	.p2align 5
 L(loop):
-	ldr	data, [src, 16]
+	ldr	data, [src, 32]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
 	fmov	synd, dend
 	cbnz	synd, L(loop_end)
-	ldr	data, [src, 32]!
+	ldr	data, [src, 16]
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
 	fmov	synd, dend
 	cbz	synd, L(loop)
-	sub	src, src, 16
+	add	src, src, 16
 L(loop_end):
-	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
-	sub	result, src, srcin
-	fmov	synd, dend
+	sub	result, shift, src, lsl 2	/* (srcin - src) << 2.  */
 #ifndef __AARCH64EB__
 	rbit	synd, synd
+	sub	result, result, 3
 #endif
-	add	result, result, 16
 	clz	tmp, synd
-	add	result, result, tmp, lsr 2
+	sub	result, tmp, result
+	lsr	result, result, 2
 	ret
 
 END (STRLEN)