author      Danila Kutenin <danilak@google.com>       2022-06-27 16:12:13 +0000
committer   Wilco Dijkstra <wilco.dijkstra@arm.com>   2024-04-08 17:25:25 +0100
commit      ea25fe5599068cbadb19e0b2d8640353c32758eb (patch)
tree        37ca9bfc3d43710e8dfebe1c472a313ae16e511e /sysdeps/aarch64/strcpy.S
parent      2c4ae9faa5dced399fe9185980056cbbfb016444 (diff)
aarch64: Optimize string functions with shrn instruction
We found that string functions were using AND+ADDP to find the nibble/syndrome mask, but there is an easier option: `SHRN dst.8b, src.8h, 4` (shift each 16-bit lane right by 4 and narrow to 1 byte), which has the same latency as ADDP on all SIMD ARMv8 targets. There are also possible gains for memcmp, but that is for another patch.

We see 10-20% savings for small and mid-sized cases (<= 128 bytes), which are the primary cases for general workloads.

(cherry picked from commit 3c9980698988ef64072f1fac339b180f52792faf)
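For readers unfamiliar with the trick, here is a minimal C sketch of the nibble-mask computation using ACLE NEON intrinsics; the helper name `nibble_mask` is hypothetical and not part of the patch, which implements this directly in assembly:

```c
#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical illustration of the SHRN trick: build a 64-bit mask with one
   nibble per input byte for a 16-byte chunk, replacing the old AND+ADDP
   reduction.  */
static inline uint64_t
nibble_mask (uint8x16_t data)
{
  /* 0xff in every byte equal to NUL, 0x00 elsewhere.  */
  uint8x16_t eq = vceqq_u8 (data, vdupq_n_u8 (0));
  /* SHRN vend.8b, vhas_nul.8h, 4: treat the 16 comparison bytes as eight
     16-bit lanes, shift each right by 4 and narrow to 8 bits, so each input
     byte contributes exactly one nibble (0xf or 0x0) to the result.  */
  uint8x8_t mask = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (mask), 0);
}
```

The payoff is that one SHRN replaces the AND+ADDP pair (and removes the need to materialise the `vrepmask` constant), while producing the same 4-bits-per-byte syndrome.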
Diffstat (limited to 'sysdeps/aarch64/strcpy.S')
-rw-r--r--   sysdeps/aarch64/strcpy.S   32
1 file changed, 12 insertions(+), 20 deletions(-)
```diff
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index da53170ece..78d27b4aa6 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -40,7 +40,6 @@
 #define len		x4
 #define synd		x4
 #define tmp		x5
-#define wtmp		w5
 #define shift		x5
 #define data1		x6
 #define dataw1		w6
@@ -50,9 +49,8 @@
 #define dataq		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 #define dataq2		q1
 
 #ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@
 # define IFSTPCPY(X,...)
 #endif
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbnz	synd, L(tail)
 
 	ldr	dataq, [src, 16]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(start_loop)
@@ -162,8 +155,7 @@ L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
```
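As a complement, here is a minimal sketch of how such a syndrome is decoded on a little-endian target; the helper name `first_nul_index` is hypothetical, and the patch does the equivalent with `rbit`+`clz` in assembly (big-endian builds skip the bit reversal, hence the `#ifndef __AARCH64EB__` above):

```c
#include <stdint.h>

/* Hypothetical decode of the nibble mask, little-endian case.  Each input
   byte owns one nibble of the mask, lowest nibble first, so the first NUL
   byte sits at (trailing zero bits) / 4.  The caller must ensure synd != 0,
   which the assembly does with cbnz/cbz before falling into the tail code.  */
static inline unsigned
first_nul_index (uint64_t synd)
{
  return (unsigned) __builtin_ctzll (synd) >> 2;
}
```

The `lsr synd, synd, shift` in the entry path serves the same layout: `shift` is the low bits of `srcin` times 4, so nibbles belonging to bytes before the string start within the aligned 16-byte chunk are discarded before the syndrome is tested.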