about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog11
-rw-r--r--sysdeps/arm/armv7/multiarch/Makefile2
-rw-r--r--sysdeps/arm/armv7/multiarch/ifunc-impl-list.c5
-rw-r--r--sysdeps/arm/armv7/multiarch/memchr.S59
-rw-r--r--sysdeps/arm/armv7/multiarch/memchr_impl.S219
-rw-r--r--sysdeps/arm/armv7/multiarch/memchr_neon.S9
6 files changed, 304 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 8ef682af42..ad24611cfa 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2017-06-27  Prakhar Bahuguna  <prakhar.bahuguna@arm.com>
+
+	* sysdeps/arm/armv7/multiarch/Makefile: Add memchr_neon to
+	sysdep_routines.
+	* sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Add define for
+	__memchr_neon.
+	Add ifunc definitions for __memchr_neon and __memchr_noneon.
+	* sysdeps/arm/armv7/multiarch/memchr.S: New file.
+	* sysdeps/arm/armv7/multiarch/memchr_impl.S: Likewise.
+	* sysdeps/arm/armv7/multiarch/memchr_neon.S: Likewise.
+
 2017-06-27  Stefan Liebler  <stli@linux.vnet.ibm.com>
 
 	* sysdeps/s390/utf8-utf16-z9.c (__to_utf8_loop_vx_cu):
diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile
index e834cc937f..9e1e61c21a 100644
--- a/sysdeps/arm/armv7/multiarch/Makefile
+++ b/sysdeps/arm/armv7/multiarch/Makefile
@@ -1,3 +1,3 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_neon memcpy_vfp
+sysdep_routines += memcpy_neon memcpy_vfp memchr_neon
 endif
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
index b8094fd393..8f33156317 100644
--- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
+++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
@@ -34,6 +34,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   bool use_neon = true;
 #ifdef __ARM_NEON__
 # define __memcpy_neon	memcpy
+# define __memchr_neon	memchr
 #else
   use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0;
 #endif
@@ -52,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
 
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr, use_neon, __memchr_neon)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_noneon));
+
   return i;
 }
diff --git a/sysdeps/arm/armv7/multiarch/memchr.S b/sysdeps/arm/armv7/multiarch/memchr.S
new file mode 100644
index 0000000000..8e8097abd5
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr.S
@@ -0,0 +1,59 @@
+/* Multiple versions of memchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2013-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <rtld-global-offsets.h>
+
+#if IS_IN (libc)
+/* Under __ARM_NEON__, memchr_neon.S defines the name memchr.  */
+# ifndef __ARM_NEON__
+	.text
+	.arm
+ENTRY(memchr)
+	.type	memchr, %gnu_indirect_function
+	ldr	r1, .Lmemchr_noneon
+	tst	r0, #HWCAP_ARM_NEON
+	ldrne	r1, .Lmemchr_neon
+1:
+	add	r0, r1, pc
+	DO_RET(lr)
+
+.Lmemchr_noneon:
+	.long	C_SYMBOL_NAME(__memchr_noneon) - 1b - 8
+.Lmemchr_neon:
+	.long	C_SYMBOL_NAME(__memchr_neon) - 1b - 8
+
+END(memchr)
+
+libc_hidden_builtin_def (memchr)
+# endif  /* Not __ARM_NEON__.  */
+libc_hidden_def (__memchr_noneon)
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+# undef weak_alias
+# define weak_alias(x, y)
+# undef libc_hidden_def
+# define libc_hidden_def(name)
+
+# define memchr __memchr_noneon
+
+#endif
+
+#include "memchr_impl.S"
diff --git a/sysdeps/arm/armv7/multiarch/memchr_impl.S b/sysdeps/arm/armv7/multiarch/memchr_impl.S
new file mode 100644
index 0000000000..e8cbb97368
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_impl.S
@@ -0,0 +1,219 @@
+/* memchr implemented using NEON.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef MEMCHR_NEON
+
+#include <sysdep.h>
+
+	.arch	armv7-a
+	.fpu	neon
+
+
+/* Arguments */
+#define srcin		r0
+#define chrin		r1
+#define cntin		r2
+
+/* Retval */
+#define result		r0	/* Live range does not overlap with srcin */
+
+/* Working registers */
+#define src		r1	/* Live range does not overlap with chrin */
+#define tmp		r3
+#define synd		r0	/* No overlap with srcin or result */
+#define soff		r12
+
+/* Working NEON registers */
+#define vrepchr		q0
+#define vdata0		q1
+#define vdata0_0	d2	/* Lower half of vdata0 */
+#define vdata0_1	d3	/* Upper half of vdata0 */
+#define vdata1		q2
+#define vdata1_0	d4	/* Lower half of vhas_chr0 */
+#define vdata1_1	d5	/* Upper half of vhas_chr0 */
+#define vrepmask	q3
+#define vrepmask0	d6
+#define vrepmask1	d7
+#define vend		q4
+#define vend0		d8
+#define vend1		d9
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
+ * byte. Each bit is set if the relevant byte matched the requested character
+ * and cleared otherwise. Since the bits in the syndrome reflect exactly the
+ * order in which things occur in the original string, counting trailing zeros
+ * allows to identify exactly which byte has matched.
+ */
+
+#ifndef NO_THUMB
+	.thumb_func
+#else
+	.arm
+#endif
+	.p2align 4,,15
+
+ENTRY(memchr)
+	/* Use a simple loop if there are less than 8 bytes to search.  */
+	cmp	cntin, #7
+	bhi	.Llargestr
+	and	chrin, chrin, #0xff
+
+.Lsmallstr:
+	subs	cntin, cntin, #1
+	blo	.Lnotfound	/* Return not found if reached end.  */
+	ldrb	tmp, [srcin], #1
+	cmp	tmp, chrin
+	bne	.Lsmallstr	/* Loop again if not found.  */
+	/* Otherwise fixup address and return.  */
+	sub	result, srcin, #1
+	bx	lr
+
+
+.Llargestr:
+	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes. */
+	/*
+	 * Magic constant 0x8040201008040201 allows us to identify which lane
+	 * matches the requested byte.
+	 */
+	movw	tmp, #0x0201
+	movt	tmp, #0x0804
+	lsl	soff, tmp, #4
+	vmov	vrepmask0, tmp, soff
+	vmov	vrepmask1, tmp, soff
+	/* Work with aligned 32-byte chunks */
+	bic	src, srcin, #31
+	ands	soff, srcin, #31
+	beq	.Lloopintro	/* Go straight to main loop if it's aligned. */
+
+	/*
+	 * Input string is not 32-byte aligned. We calculate the syndrome
+	 * value for the aligned 32 bytes block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+	vld1.8		{vdata0, vdata1}, [src:256]!
+	sub		tmp, soff, #32
+	adds		cntin, cntin, tmp
+	vceq.i8		vdata0, vdata0, vrepchr
+	vceq.i8		vdata1, vdata1, vrepchr
+	vand		vdata0, vdata0, vrepmask
+	vand		vdata1, vdata1, vrepmask
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
+	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
+	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
+	vmov		synd, vdata0_0[0]
+
+	/* Clear the soff lower bits */
+	lsr		synd, synd, soff
+	lsl		synd, synd, soff
+	/* The first block can also be the last */
+	bls		.Lmasklast
+	/* Have we found something already? */
+#ifndef NO_THUMB
+	cbnz		synd, .Ltail
+#else
+	cmp		synd, #0
+	bne		.Ltail
+#endif
+
+
+.Lloopintro:
+	vpush	{vend}
+	/* 264/265 correspond to d8/d9 for q4 */
+	cfi_adjust_cfa_offset (16)
+	cfi_rel_offset (264, 0)
+	cfi_rel_offset (265, 8)
+	.p2align 3,,7
+.Lloop:
+	vld1.8		{vdata0, vdata1}, [src:256]!
+	subs		cntin, cntin, #32
+	vceq.i8		vdata0, vdata0, vrepchr
+	vceq.i8		vdata1, vdata1, vrepchr
+	/* If we're out of data we finish regardless of the result. */
+	bls		.Lend
+	/* Use a fast check for the termination condition. */
+	vorr		vend, vdata0, vdata1
+	vorr		vend0, vend0, vend1
+	vmov		synd, tmp, vend0
+	orrs		synd, synd, tmp
+	/* We're not out of data, loop if we haven't found the character. */
+	beq		.Lloop
+
+.Lend:
+	vpop		{vend}
+	cfi_adjust_cfa_offset (-16)
+	cfi_restore (264)
+	cfi_restore (265)
+
+	/* Termination condition found, let's calculate the syndrome value. */
+	vand		vdata0, vdata0, vrepmask
+	vand		vdata1, vdata1, vrepmask
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
+	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
+	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
+	vmov		synd, vdata0_0[0]
+#ifndef NO_THUMB
+	cbz		synd, .Lnotfound
+	bhi		.Ltail	/* Uses the condition code from
+				   subs cntin, cntin, #32 above.  */
+#else
+	cmp		synd, #0
+	beq		.Lnotfound
+	cmp		cntin, #0
+	bhi		.Ltail
+#endif
+
+
+.Lmasklast:
+	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
+	neg	cntin, cntin
+	lsl	synd, synd, cntin
+	lsrs	synd, synd, cntin
+	it	eq
+	moveq	src, #0	/* If no match, set src to 0 so the retval is 0. */
+
+
+.Ltail:
+	/* Count the trailing zeros using bit reversing */
+	rbit	synd, synd
+	/* Compensate the last post-increment */
+	sub	src, src, #32
+	/* Count the leading zeros */
+	clz	synd, synd
+	/* Compute the potential result and return */
+	add	result, src, synd
+	bx	lr
+
+
+.Lnotfound:
+	/* Set result to NULL if not found and return */
+	mov	result, #0
+	bx	lr
+
+END(memchr)
+libc_hidden_builtin_def (memchr)
+
+#else
+
+#include "../../armv6t2/memchr.S"
+
+#endif
diff --git a/sysdeps/arm/armv7/multiarch/memchr_neon.S b/sysdeps/arm/armv7/multiarch/memchr_neon.S
new file mode 100644
index 0000000000..ee21818f10
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_neon.S
@@ -0,0 +1,9 @@
+#ifdef __ARM_NEON__
+/* Under __ARM_NEON__, this file defines memchr directly.  */
+libc_hidden_builtin_def (memchr)
+#else
+# define memchr __memchr_neon
+#endif
+
+#define MEMCHR_NEON
+#include "memchr_impl.S"