AArch64 optimized implementation of strrchr.

author: Richard Earnshaw <Richard.Earnshaw@arm.com> 2015-01-07 11:26:13 +0000
committer: Richard Earnshaw <Richard.Earnshaw@arm.com> 2015-01-07 11:26:13 +0000
commit: ec582ca0f30c963a1c27f405b6732ca8507271d5 (patch)
tree: a073e1a928215f8ab0049668a34422804a6df635
parent: 60f046a82dd4de612115ba6e9abeb87b6508b95b (diff)
download: glibc-ec582ca0f30c963a1c27f405b6732ca8507271d5.tar.gz
glibc-ec582ca0f30c963a1c27f405b6732ca8507271d5.tar.xz
glibc-ec582ca0f30c963a1c27f405b6732ca8507271d5.zip
3 files changed, 173 insertions, 2 deletions
diff --git a/ChangeLog b/ChangeLog
index 604ce52ec3..212569b1c3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2015-01-07  Richard Earnshaw  <rearnsha@arm.com>
+
+	* sysdeps/aarch64/strrchr.S: New file.
+	* NEWS: Updated.
+
 2015-01-07  Eric Biggers  <ebiggers3@gmail.com>
 
 	[BZ #17658]
diff --git a/NEWS b/NEWS
index bec70dc04c..a233cf9bb8 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,9 @@ Version 2.21
   17744, 17745, 17746, 17747, 17775, 17777, 17780, 17781, 17782, 17793,
   17796, 17797, 17806
 
+* Optimized strchrnul and strrchr implementations for AArch64.
+  Contributed by ARM Ltd.
+
 * i386 memcpy functions optimized with SSE2 unaligned load/store.
 
 * CVE-2104-7817 The wordexp function could ignore the WRDE_NOCMD flag
@@ -71,8 +74,6 @@ Version 2.20
   17084, 17086, 17088, 17092, 17097, 17125, 17135, 17137, 17150, 17153,
   17187, 17213, 17259, 17261, 17262, 17263, 17319, 17325, 17354.
 
-* Optimized strchrnul implementation for AArch64.  Contributed by ARM Ltd.
-
 * Reverted change of ABI data structures for s390 and s390x:
   On s390 and s390x the size of struct ucontext and jmp_buf was increased in
   2.19. This change is reverted in 2.20. The introduced 2.19 symbol versions
diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
new file mode 100644
index 0000000000..b49e81dd19
--- /dev/null
+++ b/sysdeps/aarch64/strrchr.S
@@ -0,0 +1,165 @@
+/* strrchr: find the last instance of a character in a string.
+
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results.  */
+#define srcin		x0	/* Incoming string pointer.  */
+#define chrin		w1	/* Incoming character; its low byte is replicated into vrepchr.  */
+
+#define result		x0	/* Return value (same register as srcin).  */
+
+#define src		x2	/* 32-byte-aligned read pointer, post-incremented by each ld1.  */
+#define	tmp1		x3	/* srcin & 31, then negated and doubled to form a shift count.  */
+#define wtmp2		w4	/* Holds the magic constant 0x40100401.  */
+#define tmp3		x5	/* Padding mask in the unaligned prologue; later the clz result.  */
+#define src_match	x6	/* Copy of src taken when a hunk contained a match (points past that hunk).  */
+#define src_offset	x7	/* Character syndrome of that hunk; zero while no match is recorded.  */
+#define const_m1	x8	/* All ones, shifted to build the padding mask.  */
+#define tmp4		x9	/* Syndrome bits up to and including the lowest NUL bit.  */
+#define nul_match	x10	/* NUL syndrome of the current hunk.  */
+#define chr_match	x11	/* Character syndrome of the current hunk.  */
+
+#define vrepchr		v0	/* Search byte in all 16 lanes.  */
+#define vdata1		v1	/* First 16 bytes of the current hunk.  */
+#define vdata2		v2	/* Second 16 bytes of the current hunk.  */
+#define vhas_nul1	v3	/* Per-byte NUL compare of vdata1, later the reduced NUL syndrome.  */
+#define vhas_nul2	v4	/* Per-byte NUL compare of vdata2.  */
+#define vhas_chr1	v5	/* Per-byte character compare of vdata1, later the reduced syndrome.  */
+#define vhas_chr2	v6	/* Per-byte character compare of vdata2.  */
+#define vrepmask_0	v7	/* vrepmask_c doubled: the NUL bit of each 2-bit tuple.  */
+#define vrepmask_c	v16	/* 0x40100401 in each 32-bit lane: the character bit of each tuple.  */
+#define vend1		v17	/* Reduction of the raw NUL compares, used only for the loop exit test.  */
+#define vend2		v18	/* Not referenced by strrchr.  */
+
+/* Core algorithm.
+
+   For each 32-byte hunk we calculate a 64-bit syndrome value, with
+   two bits per byte (LSB is always in bits 0 and 1, for both big
+   and little-endian systems).  For each tuple, bit 0 is set iff
+   the relevant byte matched the requested character; bit 1 is set
+   iff the relevant byte matched the NUL end of string (we trigger
+   off bit0 for the special case of looking for NUL).  Since the bits
+   in the syndrome reflect exactly the order in which bytes occur in
+   the string, the lowest set NUL bit marks the end of the string and
+   the highest character bit at or below it marks the last match.  */
+
+ENTRY(strrchr)	/* char *strrchr (const char *s [x0], int c [w1]); result in x0.  */
+	cbz	x1, L(null_search)	// All 64 bits of x1 zero: hand off to __strchrnul.
+	/* Magic constant 0x40100401 to allow us to identify which lane
+	   matches the requested byte.  Magic constant 0x80200802 used
+	   similarly for NUL termination.  */
+	mov	wtmp2, #0x0401	// Low half of 0x40100401.
+	movk	wtmp2, #0x4010, lsl #16	// High half; the low half is kept.
+	dup	vrepchr.16b, chrin	// Low byte of c copied into all 16 lanes.
+	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
+	dup	vrepmask_c.4s, wtmp2	// Character mask in every 32-bit lane.
+	mov	src_offset, #0	// No match recorded yet.
+	ands	tmp1, srcin, #31	// tmp1 = offset of s within its hunk; Z set if aligned.
+	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+	b.eq	L(aligned)	// Flags are still those of the ands: nothing to mask.
+
+	/* Input string is not 32-byte aligned.  Rather than forcing
+	   the padding bytes to a safe value, we calculate the syndrome
+	   for all the bytes, but then mask off those bits of the
+	   syndrome that are related to the padding.  */
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32	// First hunk; src now points past it.
+	neg	tmp1, tmp1	// -offset; a register shift count is taken modulo 64.
+	cmeq	vhas_nul1.16b, vdata1.16b, #0	// 0xff in each lane holding NUL.
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b	// 0xff in each lane holding c.
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b	// Keep one distinct bit per lane.
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
+	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
+	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b	// 128->64
+	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b	// 128->64
+	mov	nul_match, vhas_nul1.2d[0]	// 64-bit NUL syndrome (odd bits).
+	lsl	tmp1, tmp1, #1	// Two syndrome bits per byte.
+	mov	const_m1, #~0
+	mov	chr_match, vhas_chr1.2d[0]	// 64-bit character syndrome (even bits).
+	lsr	tmp3, const_m1, tmp1	// tmp3 = low 2*offset bits set: the padding bytes.
+
+	bic	nul_match, nul_match, tmp3	// Mask padding bits.
+	bic	chr_match, chr_match, tmp3	// Mask padding bits.
+	cbnz	nul_match, L(tail)	// String ends inside the first hunk.
+
+L(loop):	/* Reached only when the hunk just examined held no NUL.  */
+	cmp	chr_match, #0
+	csel	src_match, src, src_match, ne	// On a match remember src (already past the hunk)...
+	csel	src_offset, chr_match, src_offset, ne	// ...and that hunk's character syndrome.
+L(aligned):	/* Entry point when s is already 32-byte aligned.  */
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32	// Next hunk; src advances by 32.
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	addp	vend1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
+	addp	vend1.16b, vend1.16b, vend1.16b	// 128->64
+	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b	// 128->64
+	mov	nul_match, vend1.2d[0]	// Built from unmasked compares: only tested against zero.
+	mov	chr_match, vhas_chr1.2d[0]
+	cbz	nul_match, L(loop)	// No NUL in this hunk: keep scanning.
+
+	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b	// NUL seen: now build the exact NUL syndrome.
+	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
+	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
+	mov	nul_match, vhas_nul1.2d[0]
+
+L(tail):
+	/* Work out exactly where the string ends.  */
+	sub	tmp4, nul_match, #1
+	eor	tmp4, tmp4, nul_match	// tmp4 = all bits up to and including the lowest NUL bit.
+	ands	chr_match, chr_match, tmp4	// Drop matches that lie beyond the terminator.
+	/* And pick the values corresponding to the last match.  */
+	csel	src_match, src, src_match, ne
+	csel	src_offset, chr_match, src_offset, ne
+
+	/* Count down from the top of the syndrome to find the last match.  */
+	clz	tmp3, src_offset
+	/* Src_match points beyond the word containing the match, so we can
+	   simply subtract half the bit-offset into the syndrome.  Because
+	   we are counting down, we need to go back one more character.  */
+	add	tmp3, tmp3, #2
+	sub	result, src_match, tmp3, lsr #1
+	/* But if the syndrome shows no match was found, then return NULL.  */
+	cmp	src_offset, #0
+	csel	result, result, xzr, ne
+
+	ret
+L(null_search):
+	b	__strchrnul	// Plain branch with x0/x1 untouched: its return goes straight to our caller.
+
+END(strrchr)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
author	Richard Earnshaw <Richard.Earnshaw@arm.com>	2015-01-07 11:26:13 +0000
committer	Richard Earnshaw <Richard.Earnshaw@arm.com>	2015-01-07 11:26:13 +0000
commit	ec582ca0f30c963a1c27f405b6732ca8507271d5 (patch)
tree	a073e1a928215f8ab0049668a34422804a6df635
parent	60f046a82dd4de612115ba6e9abeb87b6508b95b (diff)
download	glibc-ec582ca0f30c963a1c27f405b6732ca8507271d5.tar.gz glibc-ec582ca0f30c963a1c27f405b6732ca8507271d5.tar.xz glibc-ec582ca0f30c963a1c27f405b6732ca8507271d5.zip