-rw-r--r--	sysdeps/aarch64/memrchr.S	199
1 file changed, 83 insertions(+), 116 deletions(-)
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index ace5a94e8f..c25f430970 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -22,144 +22,111 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
 /* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
 #define cntin		x2
-
-#define seek_dstin  x11
-#define seek_dst  x12
 #define result		x0
 
 #define src		x3
-#define	tmp		x4
-#define wtmp2		w5
-#define synd		x6
-#define soff		x9
-#define cntrem		x10
+#define cntrem		x4
+#define synd		x5
+#define shift		x6
+#define	tmp		x7
+#define wtmp		w7
+#define end		x8
+#define endm1		x9
 
 #define vrepchr		v0
-#define vdata1		v1
-#define vdata2		v2
-
-#define vhas_chr1	v3
-#define vhas_chr2	v4
-#define vrepmask	v5
-#define vend		v6
+#define qdata		q1
+#define vdata		v1
+#define vhas_chr	v2
+#define vrepmask	v3
+#define vend		v4
+#define dend		d4
 
 /*
- * Core algorithm:
- *
- * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
- * per byte. For each tuple, bit 0 is set if the relevant byte matched the
- * requested character and bit 1 is not used (faster than using a 32bit
- * syndrome). Since the bits in the syndrome reflect exactly the order in which
- * things occur in the original string, counting trailing zeros allows to
- * identify exactly which byte has matched.
- */
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+   requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
+   odd bytes so that adjacent bytes can be merged. Since the bits in the
+   syndrome reflect the order in which things occur in the original string,
+   counting leading zeros identifies exactly which byte matched.  */
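+
+/* For instance, if only bytes 1 and 3 of a chunk match, the masked and
+   merged syndrome is 0xf0f0: clz gives 48, and 48 / 4 = 12 is the distance
+   of the last match from byte 15, the end of the chunk.  */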
 
 ENTRY (__memrchr)
-	/* Do not dereference srcin if no bytes to compare.  */
-	cbz	cntin, L(zero_length)
-	/*
-	 * Magic constant 0x40100401 allows us to identify which lane matches
-	 * the requested byte.
-	 */
-	mov	wtmp2, #0x0401
-	movk	wtmp2, #0x4010, lsl #16
+	DELOUSE (0)
+	DELOUSE (2)
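+	/* Work backwards from the end of the buffer: endm1 is its last byte
+	   and src is endm1 aligned down to 16, so the first (possibly
+	   partial) chunk is read with one aligned load. Aligned 16-byte
+	   loads never cross a tag granule boundary, which keeps the
+	   overreads MTE-safe.  */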
+	add	end, srcin, cntin
+	sub	endm1, end, 1
+	bic	src, endm1, 15
+	cbz	cntin, L(nomatch)
+	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	/* Work with aligned 32-byte chunks */
-	add	seek_dstin,  cntin, srcin
-	add	tmp, seek_dstin, #31
-	bic	seek_dst, tmp, #31
-	dup	vrepmask.4s, wtmp2
-	ands	soff, seek_dstin, #31
-	mov	tmp, #32
-	sub	soff, tmp, soff
-	and	cntrem, cntin, #31
-	b.eq	L(loop)
-
-	/*
-	 * Input string is not 32-byte aligned. We calculate the syndrome
-	 * value for the aligned 32 bytes block containing the first bytes
-	 * and mask the irrelevant part.
-	 */
-
-	sub	seek_dst, seek_dst, #32
-	ld1	{vdata1.16b, vdata2.16b}, [seek_dst]
-	sub	tmp, soff, #32
-	adds	cntin, cntin, tmp
-	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
-	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
-	mov	synd, vend.2d[0]
-	/* Clear the (32-soff)*2 upper bits */
-	lsl	tmp, soff, #1
-	lsl	synd, synd, tmp
-	lsr	synd, synd, tmp
-	/* The first block can also be the last */
-	b.ls	L(masklast)
-	/* Have we found something already? */
-	cbnz	synd, L(tail)
-
-L(loop):
-	sub	seek_dst, seek_dst, #32
-	ld1	{vdata1.16b, vdata2.16b}, [seek_dst]
-	subs	cntin, cntin, #32
-	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	/* If we're out of data we finish regardless of the result */
-	b.ls	L(end)
-	/* Use a fast check for the termination condition */
-	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
-	addp	vend.2d, vend.2d, vend.2d
-	mov	synd, vend.2d[0]
-	/* We're not out of data, loop if we haven't found the character */
-	cbz	synd, L(loop)
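+	/* 0xf00f replicated per halfword keeps bits 0-3 of even bytes and
+	   bits 4-7 of odd bytes, so addp can fold each byte pair into one
+	   syndrome nibble per input byte.  */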
+	mov	wtmp, 0xf00f
+	dup	vrepmask.8h, wtmp
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
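+	/* shift = 4 * (16 - end % 16) (mod 64): the lsl below uses it to
+	   discard syndrome nibbles for bytes at or beyond end.  */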
+	neg	shift, end, lsl 2
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	fmov	synd, dend
+	lsl	synd, synd, shift
+	cbz	synd, L(start_loop)
 
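+	/* Match in the final chunk: clz / 4 is the distance of the last
+	   match from endm1; the hi check keeps the result only when that
+	   match lies at or after srcin.  */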
+	clz	synd, synd
+	sub	result, endm1, synd, lsr 2
+	cmp	cntin, synd, lsr 2
+	csel	result, result, xzr, hi
+	ret
+
+L(start_loop):
+	sub	tmp, end, src
+	subs	cntrem, cntin, tmp
+	b.ls	L(nomatch)
+
+	/* Make sure that it won't overread by a 16-byte chunk: if cntrem
+	   mod 32 is in [1, 16], entering at L(loop32) would load one chunk
+	   too many, and bit 4 of cntrem + 15 is set exactly in that case.  */
+	add	tmp, cntrem, 15
+	tbnz	tmp, 4, L(loop32_2)
+
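+	/* Main loop, unrolled by two: L(loop32) does no count check, while
+	   L(loop32_2) subtracts 32 from cntrem, so each pair of 16-byte
+	   loads pays for a single count-check branch.  */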
+	.p2align 4
+L(loop32):
+	ldr	qdata, [src, -16]!
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	synd, dend
+	cbnz	synd, L(end)
+
+L(loop32_2):
+	ldr	qdata, [src, -16]!
+	subs	cntrem, cntrem, 32
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	b.ls	L(end)
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	synd, dend
+	cbz	synd, L(loop32)
 L(end):
-	/* Termination condition found, let's calculate the syndrome value */
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
-	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
-	mov	synd, vend.2d[0]
-	/* Only do the clear for the last possible block */
-	b.hi	L(tail)
-
-L(masklast):
-	/* Clear the (32 - ((cntrem + (32-soff)) % 32)) * 2 lower bits */
-	add	tmp, cntrem, soff
-	and	tmp, tmp, #31
-	sub	tmp, tmp, #32
-	neg	tmp, tmp, lsl #1
-	lsr	synd, synd, tmp
-	lsl	synd, synd, tmp
-
-L(tail):
-	/* Compensate the last post-increment*/
-	add	seek_dst, seek_dst, #32
-	/* Check that we have found a character */
-	cmp	synd, #0
-	/* And count the leading zeros */
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	synd, dend
+
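+	/* Because the loop loads with ldr, the syndrome comes out
+	   bit-reversed on big-endian; flip it before counting. tmp = src + 15
+	   is the last byte of the matching chunk and clz / 4 the distance of
+	   the last match from it; the hs check rejects a match below srcin,
+	   possible because the aligned final chunk may start before the
+	   buffer.  */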
+	add	tmp, src, 15
+#ifdef __AARCH64EB__
+	rbit	synd, synd
+#endif
 	clz	synd, synd
-	/* Compute the potential result */
-	sub	result, seek_dst, synd, lsr #1
-	sub	result, result, #1
-	/* Select result or NULL */
-	csel	result, xzr, result, eq
+	sub	tmp, tmp, synd, lsr 2
+	cmp	tmp, srcin
+	csel	result, tmp, xzr, hs
 	ret
 
-L(zero_length):
-	mov	result, #0
+L(nomatch):
+	mov	result, 0
 	ret
+
 END (__memrchr)
 weak_alias (__memrchr, memrchr)
 libc_hidden_builtin_def (memrchr)
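
The syndrome encoding is easier to follow in scalar form. The sketch below is
a minimal C model (illustrative only, not glibc code: syndrome16 is a made-up
helper, and __builtin_clzll assumes GCC or Clang); it reproduces the
nibble-per-byte encoding and the clz arithmetic for a single 16-byte chunk,
ignoring alignment, end-masking and tail handling:

#include <assert.h>
#include <stdint.h>

/* Nibble j of the syndrome is 0xf iff chunk[j] == c: the scalar equivalent
   of the cmeq / and-with-vrepmask / addp sequence for one chunk.  */
static uint64_t
syndrome16 (const unsigned char *chunk, unsigned char c)
{
  uint64_t synd = 0;
  for (int j = 0; j < 16; j++)
    if (chunk[j] == c)
      synd |= (uint64_t) 0xf << (4 * j);
  return synd;
}

int
main (void)
{
  /* 'x' appears at bytes 1 and 3 of the chunk.  */
  unsigned char chunk[16] = { 'a', 'x', '.', 'x' };
  uint64_t synd = syndrome16 (chunk, 'x');
  assert (synd == 0xf0f0);

  /* As in the patch: clz / 4 is the distance of the last match from byte 15,
     the end of the chunk, so the final 'x' is at byte 15 - 12 = 3.  */
  int dist = __builtin_clzll (synd) >> 2;
  assert (dist == 12 && 15 - dist == 3);
  return 0;
}

One nibble per byte is what lets a single addp fold the 16-byte comparison
mask into 64 bits that reach a general register through one fmov, while
keeping byte order exact for the clz-based index computation.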