about summary refs log tree commit diff
path: root/sysdeps/powerpc/powerpc64
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/powerpc/powerpc64')
-rw-r--r--sysdeps/powerpc/powerpc64/le/Makefile4
-rw-r--r--sysdeps/powerpc/powerpc64/le/power9/strcpy.S276
-rw-r--r--sysdeps/powerpc/powerpc64/sysdep.h19
3 files changed, 246 insertions, 53 deletions
diff --git a/sysdeps/powerpc/powerpc64/le/Makefile b/sysdeps/powerpc/powerpc64/le/Makefile
index 9d568d4f44..b77775cf95 100644
--- a/sysdeps/powerpc/powerpc64/le/Makefile
+++ b/sysdeps/powerpc/powerpc64/le/Makefile
@@ -129,6 +129,10 @@ CFLAGS-tst-strtod-round.c += $(type-float128-CFLAGS)
 CFLAGS-tst-wcstod-round.c += $(type-float128-CFLAGS)
 CFLAGS-tst-strtod-nan-locale.c += $(type-float128-CFLAGS)
 CFLAGS-tst-wcstod-nan-locale.c += $(type-float128-CFLAGS)
+CFLAGS-tst-strtod1i.c += $(type-float128-CFLAGS)
+CFLAGS-tst-strtod3.c += $(type-float128-CFLAGS)
+CFLAGS-tst-strtod4.c += $(type-float128-CFLAGS)
+CFLAGS-tst-strtod5i.c += $(type-float128-CFLAGS)
 CFLAGS-tst-strtod6.c += $(type-float128-CFLAGS)
 CFLAGS-tst-strfrom.c += $(type-float128-CFLAGS)
 CFLAGS-tst-strfrom-locale.c += $(type-float128-CFLAGS)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
index 603bde1e39..2f50625a19 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
@@ -42,22 +42,48 @@
 
    if USE_AS_STPCPY is defined.
 
-   The implementation can load bytes past a null terminator, but only
-   up to the next 16B boundary, so it never crosses a page.  */
+   This implementation never reads across a page boundary, but may
+   read beyond the NUL terminator.  */
 
-/* Load quadword at addr+offset to vreg, check for null bytes,
+/* Load 4 quadwords, merge into one VR for speed and check for NUL
+   and branch to label if NUL is found.  */
+#define CHECK_64B(offset,addr,label)		\
+	lxv	32+v4,(offset+0)(addr);		\
+	lxv	32+v5,(offset+16)(addr);	\
+	lxv	32+v6,(offset+32)(addr);	\
+	lxv	32+v7,(offset+48)(addr);	\
+	vminub	v14,v4,v5;			\
+	vminub	v15,v6,v7;			\
+	vminub	v16,v14,v15;			\
+	vcmpequb.	v0,v16,v18;		\
+	beq	cr6,$+12;			\
+	li	r7,offset;			\
+	b	L(label);			\
+	stxv	32+v4,(offset+0)(r11);		\
+	stxv	32+v5,(offset+16)(r11);		\
+	stxv	32+v6,(offset+32)(r11);		\
+	stxv	32+v7,(offset+48)(r11)
+
+/* Load quadword at addr+offset to vreg, check for NUL bytes,
    and branch to label if any are found.  */
-#define CHECK16(vreg,offset,addr,label) \
-	lxv	vreg+32,offset(addr);	\
-	vcmpequb. v6,vreg,v18;	\
+#define CHECK_16B(vreg,offset,addr,label)	\
+	lxv	vreg+32,offset(addr);		\
+	vcmpequb.	v15,vreg,v18;		\
 	bne	cr6,L(label);
 
-.machine power9
+/* Store vreg2 with length if NUL is found.  */
+#define STORE_WITH_LEN(vreg1,vreg2,reg)	\
+	vctzlsbb	r8,vreg1;		\
+	addi	r9,r8,1;			\
+	sldi	r9,r9,56;			\
+	stxvl	32+vreg2,reg,r9;
+
+.machine	power9
 ENTRY_TOCLESS (FUNC_NAME, 4)
 	CALL_MCOUNT 2
 
-	vspltisb v18,0		/* Zeroes in v18  */
-	vspltisb v19,-1 	/* 0xFF bytes in v19  */
+	vspltisb	v18,0		/* Zeroes in v18.  */
+	vspltisb	v19,-1		/* 0xFF bytes in v19.  */
 
 	/* Next 16B-aligned address. Prepare address for L(loop).  */
 	addi	r5,r4,16
@@ -70,14 +96,11 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
 	lvsr	v1,0,r4
 	vperm	v0,v19,v0,v1
 
-	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	vcmpequb.	v6,v0,v18	/* 0xff if byte is NUL, 0x00 otherwise.  */
 	beq	cr6,L(no_null)
 
-	/* There's a null byte.  */
-	vctzlsbb r8,v6		/* Number of trailing zeroes  */
-	addi	r9,r8,1 	/* Add null byte.  */
-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits.  */
-	stxvl	32+v0,r3,r10	/* Partial store  */
+	/* There's a NUL byte.  */
+	STORE_WITH_LEN(v6,v0,r3)
 
 #ifdef USE_AS_STPCPY
 	/* stpcpy returns the dest address plus the size not counting the
@@ -87,17 +110,22 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
 	blr
 
 L(no_null):
-	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
-	stxvl	32+v0,r3,r10	/* Partial store  */
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits.  */
+	stxvl	32+v0,r3,r10	/* Partial store.  */
 
+/* The main loop is optimized for longer strings(> 512 bytes),
+   so checking the first bytes in 16B chunks benefits shorter
+   strings a lot.  */
 	.p2align 4
-L(loop):
-	CHECK16(v0,0,r5,tail1)
-	CHECK16(v1,16,r5,tail2)
-	CHECK16(v2,32,r5,tail3)
-	CHECK16(v3,48,r5,tail4)
-	CHECK16(v4,64,r5,tail5)
-	CHECK16(v5,80,r5,tail6)
+L(aligned):
+	CHECK_16B(v0,0,r5,tail1)
+	CHECK_16B(v1,16,r5,tail2)
+	CHECK_16B(v2,32,r5,tail3)
+	CHECK_16B(v3,48,r5,tail4)
+	CHECK_16B(v4,64,r5,tail5)
+	CHECK_16B(v5,80,r5,tail6)
+	CHECK_16B(v6,96,r5,tail7)
+	CHECK_16B(v7,112,r5,tail8)
 
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
@@ -105,21 +133,146 @@ L(loop):
 	stxv	32+v3,48(r11)
 	stxv	32+v4,64(r11)
 	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	stxv	32+v7,112(r11)
 
-	addi	r5,r5,96
-	addi	r11,r11,96
+	addi	r11,r11,128
+
+	CHECK_16B(v0,128,r5,tail1)
+	CHECK_16B(v1,128+16,r5,tail2)
+	CHECK_16B(v2,128+32,r5,tail3)
+	CHECK_16B(v3,128+48,r5,tail4)
+	CHECK_16B(v4,128+64,r5,tail5)
+	CHECK_16B(v5,128+80,r5,tail6)
+	CHECK_16B(v6,128+96,r5,tail7)
+	CHECK_16B(v7,128+112,r5,tail8)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	stxv	32+v7,112(r11)
+
+	addi	r11,r11,128
+
+	CHECK_16B(v0,256,r5,tail1)
+	CHECK_16B(v1,256+16,r5,tail2)
+	CHECK_16B(v2,256+32,r5,tail3)
+	CHECK_16B(v3,256+48,r5,tail4)
+	CHECK_16B(v4,256+64,r5,tail5)
+	CHECK_16B(v5,256+80,r5,tail6)
+	CHECK_16B(v6,256+96,r5,tail7)
+	CHECK_16B(v7,256+112,r5,tail8)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	stxv	32+v7,112(r11)
+
+	addi	r11,r11,128
+
+	CHECK_16B(v0,384,r5,tail1)
+	CHECK_16B(v1,384+16,r5,tail2)
+	CHECK_16B(v2,384+32,r5,tail3)
+	CHECK_16B(v3,384+48,r5,tail4)
+	CHECK_16B(v4,384+64,r5,tail5)
+	CHECK_16B(v5,384+80,r5,tail6)
+	CHECK_16B(v6,384+96,r5,tail7)
+	CHECK_16B(v7,384+112,r5,tail8)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	stxv	32+v7,112(r11)
+
+	/* Align src pointer down to a 64B boundary.  */
+	addi	r5,r4,512
+	clrrdi	r5,r5,6
+	subf	r7,r4,r5
+	add	r11,r3,r7
+
+/* Switch to a more aggressive approach checking 64B each time.  */
+	.p2align 5
+L(strcpy_loop):
+	CHECK_64B(0,r5,tail_64b)
+	CHECK_64B(64,r5,tail_64b)
+	CHECK_64B(128,r5,tail_64b)
+	CHECK_64B(192,r5,tail_64b)
+
+	CHECK_64B(256,r5,tail_64b)
+	CHECK_64B(256+64,r5,tail_64b)
+	CHECK_64B(256+128,r5,tail_64b)
+	CHECK_64B(256+192,r5,tail_64b)
+	addi	r5,r5,512
+	addi	r11,r11,512
+
+	b	L(strcpy_loop)
+
+	.p2align 5
+L(tail_64b):
+	/* OK, we found a NUL byte.  Let's look for it in the current 64-byte
+	   block and mark it in its corresponding VR.  */
+	add	r11,r11,r7
+	vcmpequb.	v8,v4,v18
+	beq	cr6,L(no_null_16B)
+	/* There's a NUL byte.  */
+	STORE_WITH_LEN(v8,v4,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
+
+L(no_null_16B):
+	stxv	32+v4,0(r11)
+	vcmpequb.	v8,v5,v18
+	beq	cr6,L(no_null_32B)
+	/* There's a NUL byte.  */
+	addi	r11,r11,16
+	STORE_WITH_LEN(v8,v5,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
 
-	b	L(loop)
+L(no_null_32B):
+	stxv	32+v5,16(r11)
+	vcmpequb.	v8,v6,v18
+	beq	cr6,L(no_null_48B)
+	/* There's a NUL byte.  */
+	addi	r11,r11,32
+	STORE_WITH_LEN(v8,v6,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
+
+L(no_null_48B):
+	stxv	32+v6,32(r11)
+	vcmpequb.	v8,v7,v18;
+	/* There's a NUL byte.  */
+	addi	r11,r11,48
+	STORE_WITH_LEN(v8,v7,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
 
 	.p2align 4
 L(tail1):
-	vctzlsbb r8,v6		/* Number of trailing zeroes  */
-	addi	r9,r8,1		/* Add null terminator  */
-	sldi	r9,r9,56	/* stxvl wants size in top 8 bits  */
-	stxvl	32+v0,r11,r9	/* Partial store  */
+	/* There's a NUL byte.  */
+	STORE_WITH_LEN(v15,v0,r11)
 #ifdef USE_AS_STPCPY
-	/* stpcpy returns the dest address plus the size not counting the
-	   final '\0'.  */
 	add	r3,r11,r8
 #endif
 	blr
@@ -127,11 +280,9 @@ L(tail1):
 	.p2align 4
 L(tail2):
 	stxv	32+v0,0(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
+	/* There's a NUL byte.  */
 	addi	r11,r11,16
-	stxvl	32+v1,r11,r9
+	STORE_WITH_LEN(v15,v1,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -141,11 +292,8 @@ L(tail2):
 L(tail3):
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
 	addi	r11,r11,32
-	stxvl	32+v2,r11,r9
+	STORE_WITH_LEN(v15,v2,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -156,11 +304,8 @@ L(tail4):
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
 	stxv	32+v2,32(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
 	addi	r11,r11,48
-	stxvl	32+v3,r11,r9
+	STORE_WITH_LEN(v15,v3,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -172,11 +317,8 @@ L(tail5):
 	stxv	32+v1,16(r11)
 	stxv	32+v2,32(r11)
 	stxv	32+v3,48(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
 	addi	r11,r11,64
-	stxvl	32+v4,r11,r9
+	STORE_WITH_LEN(v15,v4,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -189,11 +331,39 @@ L(tail6):
 	stxv	32+v2,32(r11)
 	stxv	32+v3,48(r11)
 	stxv	32+v4,64(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
 	addi	r11,r11,80
-	stxvl	32+v5,r11,r9
+	STORE_WITH_LEN(v15,v5,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
+
+	.p2align 4
+L(tail7):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
+	addi	r11,r11,96
+	STORE_WITH_LEN(v15,v6,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
+
+	.p2align 4
+L(tail8):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	addi	r11,r11,112
+	STORE_WITH_LEN(v15,v7,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
diff --git a/sysdeps/powerpc/powerpc64/sysdep.h b/sysdeps/powerpc/powerpc64/sysdep.h
index c363939e1a..c439b06121 100644
--- a/sysdeps/powerpc/powerpc64/sysdep.h
+++ b/sysdeps/powerpc/powerpc64/sysdep.h
@@ -353,6 +353,25 @@ LT_LABELSUFFIX(name,_name_end): ; \
   DO_CALL (SYS_ify (syscall_name))
 
 #ifdef SHARED
+# define TAIL_CALL_NO_RETURN(__func) \
+    b JUMPTARGET (NOTOC (__func))
+#else
+# define TAIL_CALL_NO_RETURN(__func) \
+    .ifdef .Local ## __func; \
+    b .Local ## __func; \
+    .else; \
+.Local ## __func: \
+    mflr 0; \
+    std 0,FRAME_LR_SAVE(1); \
+    stdu 1,-FRAME_MIN_SIZE(1); \
+    cfi_adjust_cfa_offset(FRAME_MIN_SIZE); \
+    cfi_offset(lr,FRAME_LR_SAVE); \
+    bl JUMPTARGET(__func); \
+    nop; \
+    .endif
+#endif
+
+#ifdef SHARED
 #define TAIL_CALL_SYSCALL_ERROR \
     b JUMPTARGET (NOTOC (__syscall_error))
 #else