about summary refs log tree commit diff
path: root/sysdeps/aarch64/multiarch/memchr_nosimd.S
diff options
context:
space:
mode:
author Feng Xue <fxue@os.amperecomputing.com> 2018-08-09 04:38:03 -0400
committer Feng Xue <fxue@os.amperecomputing.com> 2019-02-01 08:14:21 -0500
commit 83d1cc42d8e6b18a4b6ba53addfdae98c694ea36 (patch)
tree 31baf4b5309964aadf357cb81b21f5e6614a983e /sysdeps/aarch64/multiarch/memchr_nosimd.S
parent c7d3890ff51bceb38fac0947ce1f2bb0c34f6b15 (diff)
download glibc-83d1cc42d8e6b18a4b6ba53addfdae98c694ea36.tar.gz
glibc-83d1cc42d8e6b18a4b6ba53addfdae98c694ea36.tar.xz
glibc-83d1cc42d8e6b18a4b6ba53addfdae98c694ea36.zip
aarch64: Optimized memchr specific to AmpereComputing emag
This version uses general-purpose-register-based memory instructions to
load data, because vector-register-based loads are slightly slower on emag.

Character matching is performed in parallel on one 16-byte (both size and
alignment) memory block per iteration.

    * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
    [!MEMCHR](MEMCHR): Set to __memchr.
    * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
    Add memchr_generic and memchr_nosimd.
    * sysdeps/aarch64/multiarch/ifunc-impl-list.c
    (__libc_ifunc_impl_list): Add memchr ifuncs.
    * sysdeps/aarch64/multiarch/memchr.c: New file.
    * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
    * sysdeps/aarch64/multiarch/memchr_nosimd.S: Likewise.
Diffstat (limited to 'sysdeps/aarch64/multiarch/memchr_nosimd.S')
-rw-r--r-- sysdeps/aarch64/multiarch/memchr_nosimd.S 223
1 files changed, 223 insertions, 0 deletions
diff --git a/sysdeps/aarch64/multiarch/memchr_nosimd.S b/sysdeps/aarch64/multiarch/memchr_nosimd.S
new file mode 100644
index 0000000000..5ce8eb7625
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr_nosimd.S
@@ -0,0 +1,223 @@
+/* memchr - find a character in a memory zone using base integer registers
+
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Use base integer registers.
+ */
+
+#ifndef MEMCHR
+# define MEMCHR __memchr_nosimd	/* Default symbol name if none is given.  */
+#endif
+
+/* Arguments and results.  Several aliases below share one register.  */
+#define srcin		x0	/* Start address of the range to search.  */
+#define chrin		x1	/* Char to search for; only low 8 bits used.  */
+#define cntin		x2	/* Number of bytes to search.  */
+
+#define result		x0	/* Return value; overlays srcin.  */
+
+#define repchr		x1	/* Char copied into all 8 bytes; overlays chrin.  */
+
+#define tmp1		x2	/* Scratch; overlays cntin.  */
+#define tmp2		x3	/* Scratch.  */
+#define tmp3		x4	/* Scratch.  */
+#define tmp4		x5	/* Scratch.  */
+
+#define src		x6	/* Block cursor; advanced by 16 on each load.  */
+#define srcend		x7	/* srcin + cntin.  */
+#define srcend16	x8	/* srcend rounded up to a multiple of 16.  */
+
+#define anymore		x9	/* Zero when the current block is the last.  */
+
+#define zeroones	x10	/* Holds REP8_01.  */
+
+#define data1		x11	/* First qword of the block, XORed with repchr.  */
+#define data2		x12	/* Second qword of the block, XORed with repchr.  */
+
+#define has_chr1	x13	/* Zero-byte detection result for data1.  */
+#define has_chr2	x14	/* Zero-byte detection result for data2.  */
+
+#define REP8_01		0x0101010101010101	/* 0x01 in every byte.  */
+#define REP8_7f		0x7f7f7f7f7f7f7f7f	/* 0x7f in every byte.  */
+
+
+ENTRY_ALIGN (MEMCHR, 6)	/* In: x0 = start, x1 = char, x2 = count.  */
+	/* Out: x0 = address of the first matching byte, or 0 if none.  */
+	DELOUSE (0)	/* NOTE(review): DELOUSE is defined outside this file; */
+	DELOUSE (2)	/* presumably it sanitizes args 0 and 2 -- confirm.  */
+
+	/* Do not dereference srcin if there are no bytes to compare.  */
+	cbz	cntin, L(none_chr)
+
+	/* Z is set iff srcin is 16-byte aligned; consumed by b.eq below.  */
+	tst	srcin, 15
+	bic	src, srcin, 15	/* Start of the block containing srcin.  */
+
+	mov	zeroones, REP8_01
+	and	repchr, chrin, 255
+	/* Generate a qword integer as |c|c|c|c|c|c|c|c|.  */
+	mul	repchr, repchr, zeroones
+
+	add	srcend, srcin, cntin	/* One past the last byte to search.  */
+	/*
+	 * srcend16 is the address of the block following the last block,
+	 * i.e. srcend rounded up to a multiple of 16.
+	 * [A block is 16-byte aligned and sized.]
+	 */
+	add	srcend16, srcend, 15
+	bic	srcend16, srcend16, 15
+
+	b.eq	L(loop)	/* Aligned start: no leading bytes to mask off.  */
+
+	/* Load the first block containing the start address.  */
+	ldp	data1, data2, [src], 16
+
+	lsl	tmp1, srcin, 3	/* tmp1 overlays cntin (x2), dead by now.  */
+	mov	tmp2, ~0	/* tmp3 = mask: zero in bytes before srcin.  */
+#ifdef __AARCH64EB__
+	lsr	tmp3, tmp2, tmp1
+#else
+	lsl	tmp3, tmp2, tmp1
+#endif
+	/* Z is set iff the start address is in the first qword.  */
+	tst	srcin, 8
+
+	/*
+	 * XOR every byte of the block with the char to search for, so
+	 * that each matching byte becomes zero.  In this way, searching
+	 * the char becomes detecting zero in the resulting two qwords.
+	 */
+	eor	data1, data1, repchr
+	eor	data2, data2, repchr
+
+	/*
+	 * Set the unused bytes (before the start address) to 0xff, so
+	 * that they can never be detected as zero.
+	 */
+	orn	tmp1, data1, tmp3
+	orn	tmp2, data2, tmp3
+
+	csinv	data1, tmp1, xzr, eq	/* eq: masked data1; ne: all ones.  */
+	csel	data2, data2, tmp2, eq	/* eq: unchanged; ne: masked data2.  */
+
+	/*
+	 * When the first and last block are the same, there are two cases:
+	 *  o. The memory range to search lies within one block:
+	 *      (start address - end address) < 0, so anymore = 0.
+	 *  o. The range is so large that the end address wrapped around:
+	 *      (start address - end address) > 0, so anymore = -1.
+	 * If the first block is not the last one, anymore = -1 as well.
+	 */
+	cmp	srcin, srcend
+	ccmp	src, srcend16, 0, mi	/* src is already past block 1.  */
+	csetm	anymore, ne
+	b	L(find_chr)	/* Flags from ccmp are consumed there.  */
+
+	.p2align 4
+L(loop):
+	ldp	data1, data2, [src], 16
+
+	subs	anymore, src, srcend16	/* Zero (Z set) on the last block.  */
+
+	/*
+	 * XOR with the replicated char: every byte equal to the char
+	 * being searched for becomes zero.
+	 */
+	eor	data1, data1, repchr
+	eor	data2, data2, repchr
+
+L(find_chr):
+	/*
+	 * Use the following integer test to find out if any byte in a
+	 * qword is zero.  If the qword contains no zero-valued byte, the
+	 * test result is zero.
+	 *
+	 *  (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080
+	 * =
+	 *  (qword - 0x0101010101010101) & ~(qword  | 0x7f7f7f7f7f7f7f7f)
+	 *
+	 */
+	sub	tmp1, data1, zeroones
+	sub	tmp2, data2, zeroones
+
+	orr	tmp3, data1, REP8_7f
+	orr	tmp4, data2, REP8_7f
+
+	bic	has_chr1, tmp1, tmp3
+	bic	has_chr2, tmp2, tmp4
+
+	orr	tmp1, has_chr1, has_chr2
+	ccmp	tmp1, 0, 0, ne	/* Last block: force "ne" to exit.  */
+
+	b.eq	L(loop)	/* More blocks remain and no match in this one.  */
+
+	cbz	has_chr1, 1f
+	sub	result, src, 16	/* Address of the first qword.  */
+#ifdef __AARCH64EB__
+	rev	data1, data1
+#else
+	rev	has_chr1, has_chr1
+#endif
+	b	L(done)
+
+1:	cbz	has_chr2, L(none_chr)
+	sub	result, src, 8	/* Address of the second qword.  */
+#ifdef __AARCH64EB__
+	rev	data1, data2
+#else
+	rev	has_chr1, has_chr2
+#endif
+
+L(done):
+#ifdef __AARCH64EB__
+	/*
+	 * For big-endian, has_chr1/has_chr2 cannot be used directly because
+	 * the bytes of the two qwords are reversed when loaded from memory.
+	 * Thus the char detection has to be performed again, this time on
+	 * the byte-swapped qword that was selected into data1 above.
+	 */
+	sub	tmp1, data1, zeroones
+	orr	tmp3, data1, REP8_7f
+	bic	has_chr1, tmp1, tmp3
+	rev	has_chr1, has_chr1
+#endif
+
+	/*
+	 * If the specified char is found in a qword, the corresponding
+	 * byte in has_chr has its top bit set (0x80); this is reliable only
+	 * for the first occurrence, not for later bytes.
+	 */
+	cmp	anymore, 0
+	clz	tmp1, has_chr1	/* 8 * byte index of the first match.  */
+	add	result, result, tmp1, lsr 3
+	ccmp	result, srcend, 8, eq	/* Not last block: NZCV = 0b1000.  */
+	csel	result, result, xzr, mi	/* 0 if match is at/after srcend.  */
+	ret
+
+L(none_chr):
+	mov	result, 0
+	ret
+
+END (MEMCHR)
+libc_hidden_builtin_def (MEMCHR)