about summary refs log tree commit diff
path: root/sysdeps
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/powerpc/powerpc64/le/power9/strcpy.S160
1 files changed, 89 insertions, 71 deletions
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
index 76cfcae200..b9e5afdcbe 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
@@ -45,91 +45,78 @@
    The implementation can load bytes past a null terminator, but only
    up to the next 16B boundary, so it never crosses a page.  */
 
+/* Load quadword at addr+offset to vreg, check for null bytes,
+   and branch to label if any are found.  */
+#define CHECK16(vreg,offset,addr,label) \
+	lxv	vreg+32,offset(addr);	\
+	vcmpequb. v6,vreg,v18;	\
+	bne	cr6,L(label);
+
 .machine power9
 ENTRY_TOCLESS (FUNC_NAME, 4)
 	CALL_MCOUNT 2
 
-	/* NULL string optimisation  */
-	lbz	r0,0(r4)
-	stb	r0,0(r3)
-	cmpwi	r0,0
-	beqlr
-
-	addi	r4,r4,1
-	addi	r11,r3,1
-
 	vspltisb v18,0		/* Zeroes in v18  */
+	vspltisb v19,-1 	/* 0xFF bytes in v19  */
 
-	neg	r5,r4
-	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
+	/* Next 16B-aligned address. Prepare address for L(loop).  */
+	addi	r5,r4,16
+	clrrdi	r5,r5,4
+	subf	r8,r4,r5
+	add	r11,r3,r8
 
-	/* Get source 16B aligned  */
+	/* Align data and fill bytes not loaded with non matching char.  */
 	lvx	v0,0,r4
 	lvsr	v1,0,r4
-	vperm	v0,v18,v0,v1
-
-	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
-	vctzlsbb r7,v6		/* Number of trailing zeroes  */
-	addi	r8,r7,1		/* Add null terminator  */
+	vperm	v0,v19,v0,v1
 
-	/* r8 = bytes including null
-	   r9 = bytes to get source 16B aligned
-	   if r8 > r9
-	      no null, copy r9 bytes
-	   else
-	      there is a null, copy r8 bytes and return.  */
-	cmpd	r8,r9
-	bgt	L(no_null)
+	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	beq	cr6,L(no_null)
 
-	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
-	stxvl	32+v0,r11,r10	/* Partial store  */
+	/* There's a null byte.  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	addi	r9,r8,1 	/* Add null byte.  */
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits.  */
+	stxvl	32+v0,r3,r10	/* Partial store  */
 
 #ifdef USE_AS_STPCPY
 	/* stpcpy returns the dest address plus the size not counting the
 	   final '\0'.  */
-	add	r3,r11,r7
+	add	r3,r3,r8
 #endif
 	blr
 
 L(no_null):
-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
-	stxvl	32+v0,r11,r10	/* Partial store  */
-
-	add	r4,r4,r9
-	add	r11,r11,r9
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r3,r10	/* Partial store  */
 
+	.p2align 4
 L(loop):
-	lxv	32+v0,0(r4)
-	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
-	bne	cr6,L(tail1)
-
-	lxv	32+v1,16(r4)
-	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
-	bne	cr6,L(tail2)
-
-	lxv	32+v2,32(r4)
-	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
-	bne	cr6,L(tail3)
-
-	lxv	32+v3,48(r4)
-	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
-	bne	cr6,L(tail4)
+	CHECK16(v0,0,r5,tail1)
+	CHECK16(v1,16,r5,tail2)
+	CHECK16(v2,32,r5,tail3)
+	CHECK16(v3,48,r5,tail4)
+	CHECK16(v4,64,r5,tail5)
+	CHECK16(v5,80,r5,tail6)
 
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
 	stxv	32+v2,32(r11)
 	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
 
-	addi	r4,r4,64
-	addi	r11,r11,64
+	addi	r5,r5,96
+	addi	r11,r11,96
 
 	b	L(loop)
 
+	.p2align 4
 L(tail1):
-	vctzlsbb r8,v6
-	addi	r9,r8,1
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	addi	r9,r8,1		/* Add null terminator  */
 	sldi	r9,r9,56	/* stxvl wants size in top 8 bits  */
-	stxvl	32+v0,r11,r9
+	stxvl	32+v0,r11,r9	/* Partial store  */
 #ifdef USE_AS_STPCPY
 	/* stpcpy returns the dest address plus the size not counting the
 	   final '\0'.  */
@@ -137,50 +124,81 @@ L(tail1):
 #endif
 	blr
 
+	.p2align 4
 L(tail2):
 	stxv	32+v0,0(r11)
-	vctzlsbb r8,v6		/* Number of trailing zeroes  */
-	addi	r9,r8,1		/* Add null terminator  */
-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	vctzlsbb r8,v6
+	addi	r9,r8,1
+	sldi	r9,r9,56
 	addi	r11,r11,16
-	stxvl	32+v1,r11,r10	/* Partial store  */
+	stxvl	32+v1,r11,r9
 #ifdef USE_AS_STPCPY
-	/* stpcpy returns the dest address plus the size not counting the
-	   final '\0'.  */
 	add	r3,r11,r8
 #endif
 	blr
 
+	.p2align 4
 L(tail3):
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
-	vctzlsbb r8,v6		/* Number of trailing zeroes  */
-	addi	r9,r8,1		/* Add null terminator  */
-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	vctzlsbb r8,v6
+	addi	r9,r8,1
+	sldi	r9,r9,56
 	addi	r11,r11,32
-	stxvl	32+v2,r11,r10	/* Partial store  */
+	stxvl	32+v2,r11,r9
 #ifdef USE_AS_STPCPY
-	/* stpcpy returns the dest address plus the size not counting the
-	   final '\0'.  */
 	add	r3,r11,r8
 #endif
 	blr
 
+	.p2align 4
 L(tail4):
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
 	stxv	32+v2,32(r11)
-	vctzlsbb r8,v6		/* Number of trailing zeroes  */
-	addi	r9,r8,1		/* Add null terminator  */
-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	vctzlsbb r8,v6
+	addi	r9,r8,1
+	sldi	r9,r9,56
 	addi	r11,r11,48
-	stxvl	32+v3,r11,r10	/* Partial store  */
+	stxvl	32+v3,r11,r9
 #ifdef USE_AS_STPCPY
-	/* stpcpy returns the dest address plus the size not counting the
-	   final '\0'.  */
 	add	r3,r11,r8
 #endif
 	blr
+
+	.p2align 4
+L(tail5):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	vctzlsbb r8,v6
+	addi	r9,r8,1
+	sldi	r9,r9,56
+	addi	r11,r11,64
+	stxvl	32+v4,r11,r9
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
+
+	.p2align 4
+L(tail6):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	vctzlsbb r8,v6
+	addi	r9,r8,1
+	sldi	r9,r9,56
+	addi	r11,r11,80
+	stxvl	32+v5,r11,r9
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
+
 END (FUNC_NAME)
 #ifndef USE_AS_STPCPY
 libc_hidden_builtin_def (strcpy)