author    Wilco Dijkstra <wilco.dijkstra@arm.com>    2023-01-11 13:52:08 +0000
committer Wilco Dijkstra <wilco.dijkstra@arm.com>    2024-04-08 17:26:45 +0100
commit    1c1313dbdda1cba3761f95de0fa2e02cd4b99c23 (patch)
tree      23e7ab27157bbd6cc654e92fc7caad6d66d2cba9
parent    80ad6cd30250e0c39cee796a99a5c18cc6fe6cf5 (diff)
AArch64: Optimize strchr
Simplify calculation of the mask using shrn.  Unroll the main loop.
Small strings are 20% faster on modern CPUs.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
(cherry picked from commit 51541a229740801882490177fa178e49264b13fb)
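For reference, the patched code builds a 64-bit syndrome per 16-byte chunk:
cmeq produces 0xff/0x00 byte masks for the character and for NUL, bit merges
them under the 0x33 repmask, and shrn by 4 keeps one nibble per byte,
narrowing 128 bits to 64.  A scalar C model of that computation follows
(an illustrative sketch only, not part of the commit; the helper name
strchr_syndrome is made up):

/* Illustrative scalar model of the NEON syndrome computation:
   one 4-bit nibble per input byte, packed into a 64-bit word.  */
#include <stdint.h>

static uint64_t
strchr_syndrome (const unsigned char *chunk, unsigned char c)
{
  uint64_t syn = 0;
  for (int i = 0; i < 16; i++)
    {
      uint64_t nib = 0;
      if (chunk[i] == c)
        nib |= 0x3;             /* Bits 0-1: byte matches c.  */
      if (chunk[i] == '\0')
        nib |= 0xc;             /* Bits 2-3: byte is NUL.  */
      syn |= nib << (4 * i);    /* Four syndrome bits per byte.  */
    }
  return syn;
}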
-rw-r--r--    sysdeps/aarch64/strchr.S | 52
1 file changed, 24 insertions(+), 28 deletions(-)
diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index 003bf4a478..4781d45bd9 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -32,8 +32,7 @@
 
 #define src		x2
 #define tmp1		x1
-#define wtmp2		w3
-#define tmp3		x3
+#define tmp2		x3
 
 #define vrepchr		v0
 #define vdata		v1
@@ -41,39 +40,30 @@
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
-#define vend		v6
-#define dend		d6
+#define vend		v5
+#define dend		d5
 
 /* Core algorithm.
 
    For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
-   requested character, bits 2-3 are set if the byte is NUL (or matched), and
-   bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd
-   bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
-   in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   per byte. Bits 0-1 are set if the relevant byte matched the requested
+   character, bits 2-3 are set if the byte is NUL or matched. Count trailing
+   zeroes gives the position of the matching byte if it is a multiple of 4.
+   If it is not a multiple of 4, there was no match.  */
 
 ENTRY (strchr)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	wtmp2, 0x3003
-	dup	vrepmask.8h, wtmp2
+	movi	vrepmask.16b, 0x33
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp2, 0xf00f
-	dup	vrepmask2.8h, wtmp2
-
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	lsl	tmp3, srcin, 2
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
-
+	lsl	tmp2, srcin, 2
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
-	lsr	tmp1, tmp1, tmp3
+	lsr	tmp1, tmp1, tmp2
 	cbz	tmp1, L(loop)
 
 	rbit	tmp1, tmp1
@@ -87,28 +77,34 @@ ENTRY (strchr)
 
 	.p2align 4
 L(loop):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	tmp1, dend
+	cbnz	tmp1, L(end)
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
+	sub	src, src, 16
+L(end):
 
 #ifdef __AARCH64EB__
 	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	rbit	tmp1, tmp1
 #endif
+	add	src, src, 16
 	clz	tmp1, tmp1
-	/* Tmp1 is an even multiple of 2 if the target character was
-	   found first. Otherwise we've found the end of string.  */
+	/* Tmp1 is a multiple of 4 if the target character was found.  */
 	tst	tmp1, 2
 	add	result, src, tmp1, lsr 2
 	csel	result, result, xzr, eq
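The epilogue relies on the nibble layout: a character match sets bits 0-1 of
its nibble and a NUL sets bits 2-3, so after rbit + clz (plain clz on
big-endian) the index of the first set bit is a multiple of 4 exactly when
the character was found first, and bit 1 of the index flags a preceding NUL.
The tst/csel pair performs that test branchlessly.  A scalar model of the
little-endian decode (an illustrative sketch; decode_syndrome is a made-up
name):

/* Illustrative scalar model of the rbit/clz/tst/csel epilogue.
   Caller guarantees syn != 0 (the loop exits only on a nonzero
   syndrome).  */
#include <stddef.h>
#include <stdint.h>

static const char *
decode_syndrome (uint64_t syn, const char *chunk)
{
  int tz = __builtin_ctzll (syn);  /* Models rbit + clz.  */
  if (tz & 2)                      /* NUL nibble (0xc) came first.  */
    return NULL;
  return chunk + (tz >> 2);        /* Match: tz is a multiple of 4.  */
}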