From 90a6ca8b28bf34e361e577e526e1b0f4c39a32a5 Mon Sep 17 00:00:00 2001
From: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu, 2 May 2024 16:43:13 +0100
Subject: aarch64: Fix AdvSIMD libmvec routines for big-endian

Previously many routines used * to load from vector types stored
in the data table. This is emitted as ldr, which byte-swaps the
entire vector register, and causes bugs for big-endian when not
all lanes contain the same value. When a vector is to be used
this way, it has been replaced with an array and the load with an
explicit ld1 intrinsic, which byte-swaps only within lanes.

As well, many routines previously used non-standard GCC syntax
for vector operations such as indexing into vectors types with []
and assembling vectors using {}. This syntax should not be mixed
with ACLE, as the former does not respect endianness whereas the
latter does. Such examples have been replaced with, for instance,
vcombine_* and vgetq_lane* intrinsics. Helpers which only use the
GCC syntax, such as the v_call helpers, do not need changing as
they do not use intrinsics.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
---
 sysdeps/aarch64/fpu/tan_advsimd.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'sysdeps/aarch64/fpu/tan_advsimd.c')

diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index 0459821ab2..d56a102dd1 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -23,7 +23,8 @@
 static const struct data
 {
   float64x2_t poly[9];
-  float64x2_t half_pi, two_over_pi, shift;
+  double half_pi[2];
+  float64x2_t two_over_pi, shift;
 #if !WANT_SIMD_EXCEPT
   float64x2_t range_val;
 #endif
@@ -81,8 +82,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
   /* Use q to reduce x to r in [-pi/4, pi/4], by:
      r = x - q * pi/2, in extended precision.  */
   float64x2_t r = x;
-  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
-  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
+  float64x2_t half_pi = vld1q_f64 (dat->half_pi);
+  r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+  r = vfmsq_laneq_f64 (r, q, half_pi, 1);
   /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
      formula.  */
   r = vmulq_n_f64 (r, 0.5);
-- 
cgit 1.4.1