Diffstat (limited to 'sysdeps/aarch64/multiarch')
-rw-r--r--  sysdeps/aarch64/multiarch/Makefile           |   2
-rw-r--r--  sysdeps/aarch64/multiarch/ifunc-impl-list.c  |   8
-rw-r--r--  sysdeps/aarch64/multiarch/init-arch.h        |   4
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy.c           |  18
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_a64fx.S     | 406
-rw-r--r--  sysdeps/aarch64/multiarch/memmove.c          |  18
6 files changed, 443 insertions(+), 13 deletions(-)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index dc3efffb36..04c3f17121 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,6 +1,6 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
-		   memcpy_falkor \
+		   memcpy_falkor memcpy_a64fx \
 		   memset_generic memset_falkor memset_emag memset_kunpeng \
 		   memchr_generic memchr_nosimd \
 		   strlen_mte strlen_asimd
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 99a8c68aac..911393565c 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	4
+#define MAX_IFUNC	7
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -43,12 +43,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
+#if HAVE_AARCH64_SVE_ASM
+	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
+#if HAVE_AARCH64_SVE_ASM
+	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
 	      /* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index a167699e74..6d92c1bcff 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -33,4 +33,6 @@
   bool __attribute__((unused)) bti =					      \
     HAVE_AARCH64_BTI && GLRO(dl_aarch64_cpu_features).bti;		      \
   bool __attribute__((unused)) mte =					      \
-    MTE_ENABLED ();
+    MTE_ENABLED ();							      \
+  bool __attribute__((unused)) sve =					      \
+    GLRO(dl_aarch64_cpu_features).sve;
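INIT_ARCH expands at the top of each ifunc resolver, so the new sve boolean becomes available to resolver bodies exactly like bti and mte. The flag itself is populated from the kernel's HWCAP bits elsewhere in the patch (outside this directory's diffstat). A minimal sketch of the equivalent user-space check, assuming the Linux HWCAP_SVE auxiliary-vector bit:

#include <stdbool.h>
#include <sys/auxv.h>	/* getauxval, AT_HWCAP */
#include <asm/hwcap.h>	/* HWCAP_SVE */

static bool
cpu_has_sve (void)
{
  /* The kernel advertises SVE support via the auxiliary vector.  */
  return (getauxval (AT_HWCAP) & HWCAP_SVE) != 0;
}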
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 0e0a5cbcfb..25e0081eeb 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -33,6 +33,9 @@ extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
+# endif
 
 libc_ifunc (__libc_memcpy,
             (IS_THUNDERX (midr)
@@ -40,12 +43,17 @@ libc_ifunc (__libc_memcpy,
 	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
 		? __memcpy_falkor
 		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
-		  ? __memcpy_thunderx2
-		  : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
-		     || IS_NEOVERSE_V1 (midr)
-		     ? __memcpy_simd
+		   ? __memcpy_thunderx2
+		   : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+		      || IS_NEOVERSE_V1 (midr)
+		      ? __memcpy_simd
+# if HAVE_AARCH64_SVE_ASM
+		     : (IS_A64FX (midr)
+			? __memcpy_a64fx
+			: __memcpy_generic))))));
+# else
 		     : __memcpy_generic)))));
-
+# endif
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
 #endif
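For readability, the resolver's nested conditional expressions select among the candidates as follows (an equivalent if/else sketch; select_memcpy is a hypothetical name, not part of the patch):

static __typeof (__redirect_memcpy) *
select_memcpy (uint64_t midr)
{
  if (IS_THUNDERX (midr))
    return __memcpy_thunderx;
  if (IS_FALKOR (midr) || IS_PHECDA (midr))
    return __memcpy_falkor;
  if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
    return __memcpy_thunderx2;
  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
      || IS_NEOVERSE_V1 (midr))
    return __memcpy_simd;
#if HAVE_AARCH64_SVE_ASM
  if (IS_A64FX (midr))
    return __memcpy_a64fx;
#endif
  return __memcpy_generic;
}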
diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
new file mode 100644
index 0000000000..65528405bb
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -0,0 +1,406 @@
+/* Optimized memcpy for Fujitsu A64FX processor.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8.2-a, AArch64, unaligned accesses, sve
+ *
+ */
+
+#define L2_SIZE		(8*1024*1024)/2	// half of the 8MB L2 cache
+#define CACHE_LINE_SIZE	256
+#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
+#define dest		x0
+#define src		x1
+#define n		x2	// size
+#define tmp1		x3
+#define tmp2		x4
+#define tmp3		x5
+#define rest		x6
+#define dest_ptr	x7
+#define src_ptr		x8
+#define vector_length	x9
+#define cl_remainder	x10	// CACHE_LINE_SIZE remainder
+
+#if HAVE_AARCH64_SVE_ASM
+# if IS_IN (libc)
+#  define MEMCPY __memcpy_a64fx
+#  define MEMMOVE __memmove_a64fx
+
+	.arch armv8.2-a+sve
+
+	.macro dc_zva times
+	dc	zva, tmp1
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	.if \times-1
+	dc_zva "(\times-1)"
+	.endif
+	.endm
+
+	.macro ld1b_unroll8
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
+	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
+	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
+	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
+	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
+	.endm
+
+	.macro stld1b_unroll4a
+	st1b	z0.b, p0,   [dest_ptr, #0, mul vl]
+	st1b	z1.b, p0,   [dest_ptr, #1, mul vl]
+	ld1b	z0.b, p0/z, [src_ptr,  #0, mul vl]
+	ld1b	z1.b, p0/z, [src_ptr,  #1, mul vl]
+	st1b	z2.b, p0,   [dest_ptr, #2, mul vl]
+	st1b	z3.b, p0,   [dest_ptr, #3, mul vl]
+	ld1b	z2.b, p0/z, [src_ptr,  #2, mul vl]
+	ld1b	z3.b, p0/z, [src_ptr,  #3, mul vl]
+	.endm
+
+	.macro stld1b_unroll4b
+	st1b	z4.b, p0,   [dest_ptr, #4, mul vl]
+	st1b	z5.b, p0,   [dest_ptr, #5, mul vl]
+	ld1b	z4.b, p0/z, [src_ptr,  #4, mul vl]
+	ld1b	z5.b, p0/z, [src_ptr,  #5, mul vl]
+	st1b	z6.b, p0,   [dest_ptr, #6, mul vl]
+	st1b	z7.b, p0,   [dest_ptr, #7, mul vl]
+	ld1b	z6.b, p0/z, [src_ptr,  #6, mul vl]
+	ld1b	z7.b, p0/z, [src_ptr,  #7, mul vl]
+	.endm
+
+	.macro stld1b_unroll8
+	stld1b_unroll4a
+	stld1b_unroll4b
+	.endm
+
+	.macro st1b_unroll8
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
+	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
+	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
+	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
+	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
+	.endm
+
+	.macro shortcut_for_small_size exit
+	// if n <= vector_length * 2
+	whilelo	p0.b, xzr, n
+	whilelo	p1.b, vector_length, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	ret
+1:	// if n > vector_length * 8
+	cmp	n, vector_length, lsl 3 // vector_length * 8
+	b.hi	\exit
+	// if n <= vector_length * 4
+	lsl	tmp1, vector_length, 1  // vector_length * 2
+	whilelo	p2.b, tmp1, n
+	incb	tmp1
+	whilelo	p3.b, tmp1, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	ret
+1:	// if n <= vector_length * 8
+	lsl	tmp1, vector_length, 2  // vector_length * 4
+	whilelo	p4.b, tmp1, n
+	incb	tmp1
+	whilelo	p5.b, tmp1, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	ld1b	z4.b, p4/z, [src, #4, mul vl]
+	ld1b	z5.b, p5/z, [src, #5, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	st1b	z4.b, p4, [dest, #4, mul vl]
+	st1b	z5.b, p5, [dest, #5, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	incb	tmp1			// vector_length * 5
+	incb	tmp1			// vector_length * 6
+	whilelo	p6.b, tmp1, n
+	incb	tmp1
+	whilelo	p7.b, tmp1, n
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	ld1b	z4.b, p4/z, [src, #4, mul vl]
+	ld1b	z5.b, p5/z, [src, #5, mul vl]
+	ld1b	z6.b, p6/z, [src, #6, mul vl]
+	ld1b	z7.b, p7/z, [src, #7, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	st1b	z4.b, p4, [dest, #4, mul vl]
+	st1b	z5.b, p5, [dest, #5, mul vl]
+	st1b	z6.b, p6, [dest, #6, mul vl]
+	st1b	z7.b, p7, [dest, #7, mul vl]
+	ret
+	.endm
+
+ENTRY (MEMCPY)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+L(memcpy):
+	cntb	vector_length
+	// shortcut for n < vector_length * 8
+	// gives a free ptrue to p0.b for n >= vector_length
+	shortcut_for_small_size L(vl_agnostic)
+	// end of shortcut
+
+L(vl_agnostic): // VL Agnostic
+	mov	rest, n
+	mov	dest_ptr, dest
+	mov	src_ptr, src
+	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
+	mov	tmp1, 64
+	cmp	rest, L2_SIZE
+	ccmp	vector_length, tmp1, 0, cs
+	b.eq	L(L2)
+
+L(unroll8): // unrolling and software pipeline
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	.p2align 3
+	cmp	 rest, tmp1
+	b.cc	L(last)
+	ld1b_unroll8
+	add	src_ptr, src_ptr, tmp1
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.cc	2f
+	.p2align 3
+1:	stld1b_unroll8
+	add	dest_ptr, dest_ptr, tmp1
+	add	src_ptr, src_ptr, tmp1
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.ge	1b
+2:	st1b_unroll8
+	add	dest_ptr, dest_ptr, tmp1
+
+	.p2align 3
+L(last):
+	whilelo	p0.b, xzr, rest
+	whilelo	p1.b, vector_length, rest
+	b.last	1f
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p2.b, tmp1, rest
+	incb	tmp1
+	whilelo	p3.b, tmp1, rest
+	b.last	1f
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	whilelo	p4.b, tmp1, rest
+	incb	tmp1
+	whilelo	p5.b, tmp1, rest
+	incb	tmp1
+	whilelo	p6.b, tmp1, rest
+	incb	tmp1
+	whilelo	p7.b, tmp1, rest
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
+	ld1b	z4.b, p4/z, [src_ptr, #4, mul vl]
+	ld1b	z5.b, p5/z, [src_ptr, #5, mul vl]
+	ld1b	z6.b, p6/z, [src_ptr, #6, mul vl]
+	ld1b	z7.b, p7/z, [src_ptr, #7, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	st1b	z4.b, p4, [dest_ptr, #4, mul vl]
+	st1b	z5.b, p5, [dest_ptr, #5, mul vl]
+	st1b	z6.b, p6, [dest_ptr, #6, mul vl]
+	st1b	z7.b, p7, [dest_ptr, #7, mul vl]
+	ret
+
+L(L2):
+	// align the dest address to a CACHE_LINE_SIZE boundary
+	mov	tmp1, CACHE_LINE_SIZE
+	ands	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
+	// if cl_remainder == 0
+	b.eq	L(L2_dc_zva)
+	sub	cl_remainder, tmp1, tmp2
+	// process remainder until the first CACHE_LINE_SIZE boundary
+	whilelo	p1.b, xzr, cl_remainder	// keep p0.b all true
+	whilelo	p2.b, vector_length, cl_remainder
+	b.last	1f
+	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
+	b	2f
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p3.b, tmp1, cl_remainder
+	incb	tmp1
+	whilelo	p4.b, tmp1, cl_remainder
+	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #2, mul vl]
+	ld1b	z4.b, p4/z, [src_ptr, #3, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #2, mul vl]
+	st1b	z4.b, p4, [dest_ptr, #3, mul vl]
+2:	add	dest_ptr, dest_ptr, cl_remainder
+	add	src_ptr, src_ptr, cl_remainder
+	sub	rest, rest, cl_remainder
+
+L(L2_dc_zva):
+	// zero fill
+	and	tmp1, dest, 0xffffffffffffff
+	and	tmp2, src, 0xffffffffffffff
+	subs	tmp1, tmp1, tmp2	// diff
+	b.ge	1f
+	neg	tmp1, tmp1
+1:	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
+	cmp	tmp1, tmp3
+	b.lo	L(unroll8)
+	mov	tmp1, dest_ptr
+	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
+	// unroll
+	ld1b_unroll8	// this line has to be after "b.lo L(unroll8)"
+	add	 src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+	sub	 rest, rest, CACHE_LINE_SIZE * 2
+	mov	 tmp1, ZF_DIST
+	.p2align 3
+1:	stld1b_unroll4a
+	add	tmp2, dest_ptr, tmp1	// dest_ptr + ZF_DIST
+	dc	zva, tmp2
+	stld1b_unroll4b
+	add	tmp2, tmp2, CACHE_LINE_SIZE
+	dc	zva, tmp2
+	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	cmp	rest, tmp3	// ZF_DIST + CACHE_LINE_SIZE * 2
+	b.ge	1b
+	st1b_unroll8
+	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+	b	L(unroll8)
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+
+ENTRY (MEMMOVE)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	// remove the top-byte tag from the addresses
+	// dest has to be immutable because it is the return value
+	// src has to be immutable because it is used in L(bwd_last)
+	and	tmp2, dest, 0xffffffffffffff	// save dest_notag into tmp2
+	and	tmp3, src, 0xffffffffffffff	// save src_notag into tmp3
+	cmp	n, 0
+	ccmp	tmp2, tmp3, 4, ne
+	b.ne	1f
+	ret
+1:	cntb	vector_length
+	// shortcut for n < vector_length * 8
+	// gives a free ptrue to p0.b for n >= vector_length
+	// tmp2 and tmp3 must not be used in this macro so that the
+	// untagged addresses are preserved
+	shortcut_for_small_size L(dispatch)
+	// end of shortcut
+
+L(dispatch):
+	// tmp2 = dest_notag, tmp3 = src_notag
+	// diff = dest_notag - src_notag
+	sub	tmp1, tmp2, tmp3
+	// if diff <= 0 || diff >= n then memcpy
+	cmp	tmp1, 0
+	ccmp	tmp1, n, 2, gt
+	b.cs	L(vl_agnostic)
+
+L(bwd_start):
+	mov	rest, n
+	add	dest_ptr, dest, n	// dest_end
+	add	src_ptr, src, n		// src_end
+
+L(bwd_unroll8): // unrolling and software pipeline
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	.p2align 3
+	cmp	rest, tmp1
+	b.cc	L(bwd_last)
+	sub	src_ptr, src_ptr, tmp1
+	ld1b_unroll8
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.cc	2f
+	.p2align 3
+1:	sub	src_ptr, src_ptr, tmp1
+	sub	dest_ptr, dest_ptr, tmp1
+	stld1b_unroll8
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.ge	1b
+2:	sub	dest_ptr, dest_ptr, tmp1
+	st1b_unroll8
+
+L(bwd_last):
+	mov	dest_ptr, dest
+	mov	src_ptr, src
+	b	L(last)
+
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+# endif /* IS_IN (libc) */
+#endif /* HAVE_AARCH64_SVE_ASM */
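The new assembly implements a three-tier strategy. Copies up to eight vector lengths are handled with whilelo-predicated loads and stores and no loop; mid-size copies go through the vector-length-agnostic 8x-unrolled, software-pipelined loop at L(unroll8); and copies of at least half the 8MB L2 cache on a 512-bit-SVE part (vector_length == 64) take the L(L2) path, which aligns the destination to the 256-byte cache line and issues "dc zva" ZF_DIST bytes ahead of the stores so destination lines are zero-filled rather than fetched. In C, the dispatch logic looks roughly like this (a sketch of the control flow only, not the implementation):

#include <stddef.h>

#define L2_SIZE		(8*1024*1024)/2
#define CACHE_LINE_SIZE	256
#define ZF_DIST		(CACHE_LINE_SIZE * 21)

void
a64fx_copy_shape (char *dest, const char *src, size_t n, size_t vl)
{
  /* Absolute distance between the untagged addresses; the L(L2_dc_zva)
     prologue falls back to L(unroll8) when src and dest are too close
     for the zero-fill to stay ahead of the loads.  */
  ptrdiff_t diff = dest - src;
  size_t dist = (size_t) (diff < 0 ? -diff : diff);

  if (n <= 8 * vl)
    ;  /* shortcut_for_small_size: predicated ld1b/st1b, no loop.  */
  else if (n >= L2_SIZE && vl == 64
	   && dist >= ZF_DIST + 2 * CACHE_LINE_SIZE)
    ;  /* L(L2): align dest to the cache line, then copy 512 bytes per
	  iteration while issuing "dc zva" 21 cache lines ahead.  */
  else
    ;  /* L(unroll8): 8x-unrolled, software-pipelined copy with a
	  predicated tail in L(last).  */
}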
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 12d77818a9..d0adefc547 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -33,6 +33,9 @@ extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
+# endif
 
 libc_ifunc (__libc_memmove,
             (IS_THUNDERX (midr)
@@ -40,12 +43,17 @@ libc_ifunc (__libc_memmove,
 	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
 		? __memmove_falkor
 		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
-		  ? __memmove_thunderx2
-		  : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
-		     || IS_NEOVERSE_V1 (midr)
-		     ? __memmove_simd
+		   ? __memmove_thunderx2
+		   : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+		      || IS_NEOVERSE_V1 (midr)
+		      ? __memmove_simd
+# if HAVE_AARCH64_SVE_ASM
+		     : (IS_A64FX (midr)
+			? __memmove_a64fx
+			: __memmove_generic))))));
+# else
 		     : __memmove_generic)))));
-
+# endif
 # undef memmove
 strong_alias (__libc_memmove, memmove);
 #endif
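As with memcpy, the memmove resolver differs only in the symbols it returns; the overlap handling itself lives in memcpy_a64fx.S. Per the L(dispatch) comment, a forward copy (the plain memcpy path) is safe whenever diff <= 0 || diff >= n, and only a genuine overlap forces the backward loop. A small sketch of that test (hypothetical helper, mirroring the assembly's untagged-address comparison):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool
needs_backward_copy (uintptr_t dest_notag, uintptr_t src_notag, size_t n)
{
  ptrdiff_t diff = (ptrdiff_t) (dest_notag - src_notag);
  /* Only 0 < dest - src < n means the destination starts inside the
     source range, so a forward copy would overwrite unread bytes.  */
  return diff > 0 && (size_t) diff < n;
}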