about summary refs log tree commit diff
path: root/sysdeps/powerpc/powerpc64/strcpy.S
diff options
context:
space:
mode:
authorAdhemerval Zanella <azanella@linux.vnet.ibm.com>2013-09-26 09:29:19 -0500
committerAdhemerval Zanella <azanella@linux.vnet.ibm.com>2013-10-25 13:28:24 -0500
commit69f13dbf06c6195de0ada8632271d58ca3cf55da (patch)
treec2611636127f33fdc43bb51a6ac0f2f97795136c /sysdeps/powerpc/powerpc64/strcpy.S
parent151659f6371ce39a488fd132a5c8ce5e3bba983c (diff)
downloadglibc-69f13dbf06c6195de0ada8632271d58ca3cf55da.tar.gz
glibc-69f13dbf06c6195de0ada8632271d58ca3cf55da.tar.xz
glibc-69f13dbf06c6195de0ada8632271d58ca3cf55da.zip
PowerPC: strcpy/stpcpy optimization for PPC64/POWER7
This patch intends to unify both strcpy and stpcpy implementationsi
for PPC64 and PPC64/POWER7. The idead default powerpc64 implementation
is to provide both doubleword and word aligned memory access.

For PPC64/POWER7 is also provide doubleword and word memory access,
remove the branch hints, use the cmpb instruction for compare
doubleword/words, and add an optimization for inputs of same alignment.
Diffstat (limited to 'sysdeps/powerpc/powerpc64/strcpy.S')
-rw-r--r--sysdeps/powerpc/powerpc64/strcpy.S144
1 files changed, 107 insertions, 37 deletions
diff --git a/sysdeps/powerpc/powerpc64/strcpy.S b/sysdeps/powerpc/powerpc64/strcpy.S
index a7fd85bad4..793325d7be 100644
--- a/sysdeps/powerpc/powerpc64/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/strcpy.S
@@ -22,25 +22,38 @@
 
 /* char * [r3] strcpy (char *dest [r3], const char *src [r4])  */
 
-EALIGN (strcpy, 4, 0)
+#ifdef USE_AS_STPCPY
+# define FUNC_NAME __stpcpy
+#else
+# define FUNC_NAME strcpy
+#endif
+
+EALIGN (FUNC_NAME, 4, 0)
 	CALL_MCOUNT 2
 
 #define rTMP	r0
-#define rRTN	r3	/* incoming DEST arg preserved as result */
-#define rSRC	r4	/* pointer to previous word in src */
-#define rDEST	r5	/* pointer to previous word in dest */
+#ifdef USE_AS_STPCPY
+#define rRTN    r3      /* pointer to previous word/doubleword in dest */
+#else
+#define rRTN    r12     /* pointer to previous word/doubleword in dest */
+#endif
+#define rSRC	r4	/* pointer to previous word/doubleword in src */
 #define rWORD	r6	/* current word from src */
-#define rFEFE	r7	/* constant 0xfefefefefefefeff (-0x0101010101010101) */
-#define r7F7F	r8	/* constant 0x7f7f7f7f7f7f7f7f */
-#define rNEG	r9	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
+#define rFEFE	r7	/* constant 0xfefefeff | 0xfefefefefefefeff */
+#define r7F7F	r8	/* constant 0x7f7f7f7f | 0x7f7f7f7f7f7f7f7f */
+#define rNEG	r9	/* ~(word in s1 | r7F7F) */
 #define rALT	r10	/* alternate word from src */
 
-	dcbt	0,rSRC
+#ifndef USE_AS_STPCPY
+/* Save the dst pointer to use as return value.  */
+	mr      rRTN, r3
+#endif
 	or	rTMP, rSRC, rRTN
 	clrldi.	rTMP, rTMP, 61
-	addi	rDEST, rRTN, -8
-	dcbtst	0,rRTN
-	bne	L(unaligned)
+	bne	L(check_word_alignment)
+
+/* For doubleword aligned memory, operate using doubleword load and stores.  */
+	addi	rRTN, rRTN, -8
 
 	lis	rFEFE, -0x101
 	lis	r7F7F, 0x7f7f
@@ -53,13 +66,13 @@ EALIGN (strcpy, 4, 0)
 	b	L(g2)
 
 L(g0):	ldu	rALT, 8(rSRC)
-	stdu	rWORD, 8(rDEST)
+	stdu	rWORD, 8(rRTN)
 	add	rTMP, rFEFE, rALT
 	nor	rNEG, r7F7F, rALT
 	and.	rTMP, rTMP, rNEG
 	bne-	L(g1)
 	ldu	rWORD, 8(rSRC)
-	stdu	rALT, 8(rDEST)
+	stdu	rALT, 8(rRTN)
 L(g2):	add	rTMP, rFEFE, rWORD
 	nor	rNEG, r7F7F, rWORD
 	and.	rTMP, rTMP, rNEG
@@ -70,77 +83,134 @@ L(g2):	add	rTMP, rFEFE, rWORD
 L(g1):
 #ifdef __LITTLE_ENDIAN__
 	extrdi.	rTMP, rALT, 8, 56
-	stb	rALT, 8(rDEST)
+	stbu	rALT, 8(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 48
-	stb	rTMP, 9(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 40
-	stb	rTMP, 10(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 32
-	stb	rTMP, 11(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 24
-	stb	rTMP, 12(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 16
-	stb	rTMP, 13(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 8
-	stb	rTMP, 14(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi	rTMP, rALT, 8, 0
-	stb	rTMP, 15(rDEST)
-	blr
+	stbu	rTMP, 1(rRTN)
 #else
 	extrdi.	rTMP, rALT, 8, 0
-	stb	rTMP, 8(rDEST)
+	stbu	rTMP, 8(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 8
-	stb	rTMP, 9(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 16
-	stb	rTMP, 10(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 24
-	stb	rTMP, 11(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 32
-	stb	rTMP, 12(rDEST)
-	beqlr-
+	stbu	rTMP, 1(rRTN)
+	beqlr
 	extrdi.	rTMP, rALT, 8, 40
-	stb	rTMP, 13(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 48
-	stb	rTMP, 14(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
-	stb	rALT, 15(rDEST)
+	stbu	rALT, 1(rRTN)
+#endif
 	blr
+
+L(check_word_alignment):
+	clrldi. rTMP, rTMP, 62
+	bne     L(unaligned)
+
+/* For word aligned memory, operate using word load and stores.  */
+	addi	rRTN, rRTN, -4
+
+	lis	rFEFE, -0x101
+	lis	r7F7F, 0x7f7f
+	lwz	rWORD, 0(rSRC)
+	addi	rFEFE, rFEFE, -0x101
+	addi	r7F7F, r7F7F, 0x7f7f
+	b	L(g5)
+
+L(g3):	lwzu	rALT, 4(rSRC)
+	stwu	rWORD, 4(rRTN)
+	add	rTMP, rFEFE, rALT
+	nor	rNEG, r7F7F, rALT
+	and.	rTMP, rTMP, rNEG
+	bne-	L(g4)
+	lwzu	rWORD, 4(rSRC)
+	stwu	rALT, 4(rRTN)
+L(g5):	add	rTMP, rFEFE, rWORD
+	nor	rNEG, r7F7F, rWORD
+	and.	rTMP, rTMP, rNEG
+	beq+	L(g3)
+
+	mr	rALT, rWORD
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(g4):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31
+	stbu	rALT, 4(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 1(rRTN)
+#else
+	rlwinm.	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 4(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	stbu	rALT, 1(rRTN)
 #endif
+	blr
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4
 	nop
 L(unaligned):
 	lbz	rWORD, 0(rSRC)
-	addi	rDEST, rRTN, -1
+	addi	rRTN, rRTN, -1
 	cmpwi	rWORD, 0
 	beq-	L(u2)
 
 L(u0):	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rDEST)
+	stbu	rWORD, 1(rRTN)
 	cmpwi	rALT, 0
 	beq-	L(u1)
 	nop		/* Let 601 load start of loop.  */
 	lbzu	rWORD, 1(rSRC)
-	stbu	rALT, 1(rDEST)
+	stbu	rALT, 1(rRTN)
 	cmpwi	rWORD, 0
 	bne+	L(u0)
-L(u2):	stb	rWORD, 1(rDEST)
+L(u2):	stbu	rWORD, 1(rRTN)
 	blr
-L(u1):	stb	rALT, 1(rDEST)
+L(u1):	stbu	rALT, 1(rRTN)
 	blr
+END (FUNC_NAME)
 
-END (strcpy)
+#ifndef USE_AS_STPCPY
 libc_hidden_builtin_def (strcpy)
+#endif