about summary refs log tree commit diff
path: root/sysdeps/aarch64/multiarch/memset_a64fx.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/aarch64/multiarch/memset_a64fx.S')
-rw-r--r--sysdeps/aarch64/multiarch/memset_a64fx.S268
1 files changed, 268 insertions, 0 deletions
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
new file mode 100644
index 0000000000..ce54e5418b
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -0,0 +1,268 @@
+/* Optimized memset for Fujitsu A64FX processor.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sysdeps/aarch64/memset-reg.h>
+
+/* Assumptions:
+ *
+ * ARMv8.2-a, AArch64, unaligned accesses, sve
+ *
+ */
+
+#define L1_SIZE		(64*1024)	// L1 64KB: rest >= this (with vector_length == 64) selects L(L1_prefetch)
+#define L2_SIZE         (8*1024*1024)	// L2 8MB: rest >= this (with vector_length == 64) selects L(L2). NOTE(review): this comment used to say 8MB - 1MB, but the expression is exactly 8MB; confirm the intended threshold
+#define CACHE_LINE_SIZE	256	// bytes; stride between successive dc zva targets and between the two prfm targets
+#define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1: bytes ahead of dst used by prfm in L(L1_prefetch)
+#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance: bytes ahead of dst used by dc zva in L(L2_dc_zva)
+#define rest		x8	// bytes still to be written (starts as count in L(vl_agnostic))
+#define vector_length	x9	// SVE vector length in bytes, obtained with cntb
+#define vl_remainder	x10	// vector_length remainder: bytes from dst up to the next vector_length boundary
+#define cl_remainder	x11	// CACHE_LINE_SIZE remainder: bytes from dst up to the next CACHE_LINE_SIZE boundary
+
+#if HAVE_AARCH64_SVE_ASM	/* nothing below is assembled unless this is non-zero; presumably set when the assembler accepts SVE (defined outside this file) */
+# if IS_IN (libc)
+#  define MEMSET __memset_a64fx	/* symbol name passed to ENTRY, END and libc_hidden_builtin_def below */
+
+	.arch armv8.2-a+sve	// select the architecture extension that provides the SVE instructions used below
+
+	.macro dc_zva times
+	dc	zva, tmp1	// zero the block containing address tmp1; the macro emits this dc/add pair times times in total. NOTE(review): assumes the DC ZVA block size equals CACHE_LINE_SIZE, which is not checked in this file
+	add	tmp1, tmp1, CACHE_LINE_SIZE	// tmp1 is left pointing just past the last zeroed block
+	.if \times-1
+	dc_zva "(\times-1)"
+	.endif
+	.endm
+
+	.macro st1b_unroll first=0, last=7
+	st1b	z0.b, p0, [dst, #\first, mul vl]	// store z0 under predicate p0 at dst + first * VL; dst itself is not modified
+	.if \last-\first
+	st1b_unroll "(\first+1)", \last	// recurse for slots first+1 .. last; with the defaults this expands to 8 stores (slots 0-7)
+	.endif
+	.endm
+
+	.macro shortcut_for_small_size exit
+	// Small-size path: when count <= vector_length * 8, store count bytes of z0 at dstin and ret; otherwise branch to the exit label.
+	whilelo	p0.b, xzr, count	// p0 covers bytes [0, VL); it is all-true whenever count >= VL
+	whilelo	p1.b, vector_length, count	// p1 covers bytes [VL, 2*VL)
+	b.last	1f	// taken when the last lane of p1 is active, i.e. count >= 2*VL
+	st1b	z0.b, p0, [dstin, #0, mul vl]	// count < 2*VL: two predicated stores cover everything
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	ret
+1:	// if count > vector_length * 8, leave the macro through the exit label
+	cmp	count, vector_length, lsl 3	// vector_length * 8
+	b.hi	\exit
+	// if count < vector_length * 4: four predicated stores
+	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p2.b, tmp1, count
+	incb	tmp1	// vector_length * 3
+	whilelo	p3.b, tmp1, count
+	b.last	1f	// taken when count >= 4*VL
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	st1b	z0.b, p2, [dstin, #2, mul vl]
+	st1b	z0.b, p3, [dstin, #3, mul vl]
+	ret
+1:	// if count < vector_length * 6: six predicated stores
+	lsl	tmp1, vector_length, 2	// vector_length * 4
+	whilelo	p4.b, tmp1, count
+	incb	tmp1	// vector_length * 5
+	whilelo	p5.b, tmp1, count
+	b.last	1f	// taken when count >= 6*VL
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	st1b	z0.b, p2, [dstin, #2, mul vl]
+	st1b	z0.b, p3, [dstin, #3, mul vl]
+	st1b	z0.b, p4, [dstin, #4, mul vl]
+	st1b	z0.b, p5, [dstin, #5, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4 (here 6*VL <= count <= 8*VL: eight predicated stores)
+	incb	tmp1			// vector_length * 5
+	incb	tmp1			// vector_length * 6
+	whilelo	p6.b, tmp1, count
+	incb	tmp1	// vector_length * 7
+	whilelo	p7.b, tmp1, count
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	st1b	z0.b, p2, [dstin, #2, mul vl]
+	st1b	z0.b, p3, [dstin, #3, mul vl]
+	st1b	z0.b, p4, [dstin, #4, mul vl]
+	st1b	z0.b, p5, [dstin, #5, mul vl]
+	st1b	z0.b, p6, [dstin, #6, mul vl]
+	st1b	z0.b, p7, [dstin, #7, mul vl]
+	ret
+	.endm
+
+ENTRY (MEMSET)
+
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+	cbnz	count, 1f	// count == 0: return at once without touching memory
+	ret
+1:	dup	z0.b, valw	// replicate the low byte of valw into every byte lane of z0
+	cntb	vector_length	// vector_length = number of bytes in one SVE vector
+	// shortcut for count <= vector_length * 8: the macro stores and returns by itself
+	// it also leaves p0.b all-true for count >= vector_length, which st1b_unroll relies on below
+	shortcut_for_small_size L(vl_agnostic)
+	// end of shortcut: execution continues below only through the branch to L(vl_agnostic)
+
+L(vl_agnostic): // VL Agnostic: reached only when count > vector_length * 8
+	mov	rest, count	// rest = bytes left to write
+	mov	dst, dstin	// dst = running store pointer; dstin is not modified anywhere in this file
+	add	dstend, dstin, count	// one past the last byte. NOTE(review): dstend is not read anywhere else in this file
+	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
+	mov	tmp1, 64
+	cmp	rest, L2_SIZE
+	ccmp	vector_length, tmp1, 0, cs	// compare with 64 only if rest >= L2_SIZE; otherwise flags become 0 so b.eq falls through
+	b.eq	L(L2)
+	// if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
+	cmp	rest, L1_SIZE
+	ccmp	vector_length, tmp1, 0, cs	// same pattern with the L1_SIZE threshold
+	b.eq	L(L1_prefetch)	// otherwise fall through into L(unroll32)
+
+L(unroll32): // generic loop: 32 vectors (four st1b_unroll groups) per iteration, all stores use the all-true p0
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	lsl	tmp2, vector_length, 5	// vector_length * 32
+	.p2align 3
+1:	cmp	rest, tmp2
+	b.cc	L(unroll8)	// fewer than 32 vectors left (unsigned compare)
+	st1b_unroll
+	add	dst, dst, tmp1	// step over the 8 vectors just stored
+	st1b_unroll
+	add	dst, dst, tmp1
+	st1b_unroll
+	add	dst, dst, tmp1
+	st1b_unroll
+	add	dst, dst, tmp1
+	sub	rest, rest, tmp2	// 32 vectors written in this iteration
+	b	1b
+
+L(unroll8): // 8 vectors per iteration; entered from L(unroll32) and from L(L2_dc_zva)
+	lsl	tmp1, vector_length, 3	// vector_length * 8, recomputed because L(L2_dc_zva) arrives with a different tmp1
+	.p2align 3
+1:	cmp	rest, tmp1
+	b.cc	L(last)	// fewer than 8 vectors left (unsigned compare)
+	st1b_unroll
+	add	dst, dst, tmp1
+	sub	rest, rest, tmp1
+	b	1b
+
+L(last): // tail: rest < vector_length * 8 here and may be 0, in which case every predicate is empty and nothing is stored
+	whilelo	p0.b, xzr, rest	// p0 is rebuilt from rest, so it is no longer guaranteed all-true
+	whilelo	p1.b, vector_length, rest
+	b.last	1f	// taken when rest >= 2*VL
+	st1b	z0.b, p0, [dst, #0, mul vl]
+	st1b	z0.b, p1, [dst, #1, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p2.b, tmp1, rest
+	incb	tmp1	// vector_length * 3
+	whilelo	p3.b, tmp1, rest
+	b.last	1f	// taken when rest >= 4*VL
+	st1b	z0.b, p0, [dst, #0, mul vl]
+	st1b	z0.b, p1, [dst, #1, mul vl]
+	st1b	z0.b, p2, [dst, #2, mul vl]
+	st1b	z0.b, p3, [dst, #3, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	whilelo	p4.b, tmp1, rest
+	incb	tmp1	// vector_length * 5
+	whilelo	p5.b, tmp1, rest
+	incb	tmp1	// vector_length * 6
+	whilelo	p6.b, tmp1, rest
+	incb	tmp1	// vector_length * 7
+	whilelo	p7.b, tmp1, rest
+	st1b	z0.b, p0, [dst, #0, mul vl]	// 4*VL <= rest < 8*VL: eight predicated stores, lanes past rest are inactive
+	st1b	z0.b, p1, [dst, #1, mul vl]
+	st1b	z0.b, p2, [dst, #2, mul vl]
+	st1b	z0.b, p3, [dst, #3, mul vl]
+	st1b	z0.b, p4, [dst, #4, mul vl]
+	st1b	z0.b, p5, [dst, #5, mul vl]
+	st1b	z0.b, p6, [dst, #6, mul vl]
+	st1b	z0.b, p7, [dst, #7, mul vl]
+	ret
+
+L(L1_prefetch): // if rest >= L1_SIZE (and vector_length == 64, see the branch in L(vl_agnostic))
+	.p2align 3
+1:	st1b_unroll 0, 3	// vectors 0-3: with 64-byte vectors this is the first CACHE_LINE_SIZE bytes at dst
+	prfm	pstl1keep, [dst, PF_DIST_L1]	// prefetch-for-store hint for the address PF_DIST_L1 bytes ahead
+	st1b_unroll 4, 7	// vectors 4-7: the second CACHE_LINE_SIZE bytes
+	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
+	add	dst, dst, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	cmp	rest, L1_SIZE
+	b.ge	1b	// signed test on a size; rest is below L2_SIZE on this path, so the sign bit is clear
+	cbnz	rest, L(unroll32)	// write the remainder with the generic loops; p0 is still all-true
+	ret
+
+L(L2): // reached when rest >= L2_SIZE and vector_length == 64
+	// align dst address at vector_length byte boundary
+	sub	tmp1, vector_length, 1	// low-bit mask for the offset inside a vector_length block
+	ands	tmp2, dst, tmp1	// tmp2 = that offset; Z set when dst is already aligned
+	// if vl_remainder == 0 (dst already aligned), skip the partial store
+	b.eq	1f
+	sub	vl_remainder, vector_length, tmp2	// bytes up to the next vector_length boundary
+	// process remainder until the first vector_length boundary
+	whilelt	p2.b, xzr, vl_remainder	// activate the first vl_remainder byte lanes
+	st1b	z0.b, p2, [dst]
+	add	dst, dst, vl_remainder
+	sub	rest, rest, vl_remainder
+	// align dst address at CACHE_LINE_SIZE byte boundary
+1:	mov	tmp1, CACHE_LINE_SIZE
+	ands	tmp2, dst, CACHE_LINE_SIZE - 1	// tmp2 = offset of dst inside its cache line
+	// if cl_remainder == 0 (dst already cache-line aligned), go straight to the dc zva loop
+	b.eq	L(L2_dc_zva)
+	sub	cl_remainder, tmp1, tmp2	// bytes up to the next CACHE_LINE_SIZE boundary
+	// process remainder until the first CACHE_LINE_SIZE boundary
+	mov	tmp1, xzr       // index
+2:	whilelt	p2.b, tmp1, cl_remainder	// lanes for bytes [tmp1, cl_remainder)
+	st1b	z0.b, p2, [dst, tmp1]	// store at dst + tmp1
+	incb	tmp1	// index += vector_length
+	cmp	tmp1, cl_remainder
+	b.lo	2b	// repeat while the index is still below cl_remainder
+	add	dst, dst, cl_remainder
+	sub	rest, rest, cl_remainder
+
+L(L2_dc_zva): // dst is CACHE_LINE_SIZE-aligned here. NOTE(review): DC ZVA availability and block size are not checked in this file; presumably guaranteed by whatever selects this routine
+	// zero fill ahead of the stores; the zeroed lines are overwritten with z0 later
+	mov	tmp1, dst	// dc_zva advances tmp1, dst is left alone
+	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1	// 20 dc zva operations, one per CACHE_LINE_SIZE step starting at dst
+	mov	zva_len, ZF_DIST
+	add	tmp1, zva_len, CACHE_LINE_SIZE * 2	// loop bound: ZF_DIST plus the two lines handled per iteration
+	// unroll
+	.p2align 3
+1:	st1b_unroll 0, 3	// vectors 0-3 at dst
+	add	tmp2, dst, zva_len	// address ZF_DIST bytes ahead of dst
+	dc	 zva, tmp2
+	st1b_unroll 4, 7	// vectors 4-7 at dst
+	add	tmp2, tmp2, CACHE_LINE_SIZE	// the following cache line
+	dc	zva, tmp2
+	add	dst, dst, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
+	b.ge	1b	// loop while the next pair of dc zva targets still lies inside the buffer; signed test, so a rest with the top bit set would leave early for L(unroll8)
+	cbnz	rest, L(unroll8)	// remaining bytes are written by L(unroll8) and L(last)
+	ret
+
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
+
+#endif /* IS_IN (libc) */
+#endif /* HAVE_AARCH64_SVE_ASM */