PowerPC LE strchr

http://sourceware.org/ml/libc-alpha/2013-08/msg00101.html Adds little-endian support to optimised strchr assembly. I've also tweaked the big-endian code a little. In power7/strchr.S there's a check in the tail of the function that we didn't match 0 before finding a c match, done by comparing leading zero counts. It's just as valid, and quicker, to compare the raw output from cmpb. Another little tweak is to use rldimi/insrdi in place of rlwimi for the power7 strchr functions. Since rlwimi is cracked, it is a few cycles slower. rldimi can be used on the 32-bit power7 functions too. * sysdeps/powerpc/powerpc64/power7/strchr.S (strchr): Add little-endian support. Correct typos, formatting. Optimize tail. Use insrdi rather than rlwimi. * sysdeps/powerpc/powerpc32/power7/strchr.S: Likewise. * sysdeps/powerpc/powerpc64/power7/strchrnul.S (__strchrnul): Add little-endian support. Correct typos. * sysdeps/powerpc/powerpc32/power7/strchrnul.S: Likewise. Use insrdi rather than rlwimi. * sysdeps/powerpc/powerpc64/strchr.S (rTMP4, rTMP5): Define. Use in loop and entry code to keep "and." results. (strchr): Add little-endian support. Comment. Move cntlzd earlier in tail. * sysdeps/powerpc/powerpc32/strchr.S: Likewise.
author: Alan Modra <amodra@gmail.com> 2013-08-17 18:46:05 +0930
committer: Alan Modra <amodra@gmail.com> 2013-10-04 10:40:22 +0930
commit: 664318c3eb07032e2bfcf47cb2aa3c89280c19e7 (patch)
tree: 338e8a4e5b1215319560caa795ce5830f2f46685 /sysdeps/powerpc/powerpc32/strchr.S
parent: 43b84013714c46e6dcae4a5564c5527777ad5e08 (diff)
download: glibc-664318c3eb07032e2bfcf47cb2aa3c89280c19e7.tar.gz
glibc-664318c3eb07032e2bfcf47cb2aa3c89280c19e7.tar.xz
glibc-664318c3eb07032e2bfcf47cb2aa3c89280c19e7.zip
1 files changed, 51 insertions, 20 deletions
diff --git a/sysdeps/powerpc/powerpc32/strchr.S b/sysdeps/powerpc/powerpc32/strchr.S
index c9952eeccf..6050565770 100644
--- a/sysdeps/powerpc/powerpc32/strchr.S
+++ b/sysdeps/powerpc/powerpc32/strchr.S
@@ -36,6 +36,8 @@ ENTRY (strchr)
 #define rIGN	r10	/* number of bits we should ignore in the first word */
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
 #define rTMP3	r12
+#define rTMP4	rIGN
+#define rTMP5	rMASK
 
 
 	rlwimi	rCHR, rCHR, 8, 16, 23
@@ -49,64 +51,93 @@ ENTRY (strchr)
 	addi	r7F7F, r7F7F, 0x7f7f
 /* Test the first (partial?) word.  */
 	lwz	rWORD, 0(rSTR)
+#ifdef __LITTLE_ENDIAN__
+	slw	rMASK, rMASK, rIGN
+#else
 	srw	rMASK, rMASK, rIGN
+#endif
 	orc	rWORD, rWORD, rMASK
 	add	rTMP1, rFEFE, rWORD
 	nor	rTMP2, r7F7F, rWORD
-	and.	rTMP1, rTMP1, rTMP2
+	and.	rTMP4, rTMP1, rTMP2
 	xor	rTMP3, rCHR, rWORD
 	orc	rTMP3, rTMP3, rMASK
 	b	L(loopentry)
 
 /* The loop.  */
 
-L(loop):lwzu rWORD, 4(rSTR)
-	and.	rTMP1, rTMP1, rTMP2
+L(loop):
+	lwzu	rWORD, 4(rSTR)
+	and.	rTMP5, rTMP1, rTMP2
 /* Test for 0.	*/
-	add	rTMP1, rFEFE, rWORD
-	nor	rTMP2, r7F7F, rWORD
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
 	bne	L(foundit)
-	and.	rTMP1, rTMP1, rTMP2
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
 /* Start test for the bytes we're looking for.  */
 	xor	rTMP3, rCHR, rWORD
 L(loopentry):
 	add	rTMP1, rFEFE, rTMP3
 	nor	rTMP2, r7F7F, rTMP3
 	beq	L(loop)
+
 /* There is a zero byte in the word, but may also be a matching byte (either
    before or after the zero byte).  In fact, we may be looking for a
-   zero byte, in which case we return a match.  We guess that this hasn't
-   happened, though.  */
-L(missed):
-	and.	rTMP1, rTMP1, rTMP2
+   zero byte, in which case we return a match.  */
+	and.	rTMP5, rTMP1, rTMP2
 	li	rRTN, 0
 	beqlr
-/* It did happen. Decide which one was first...
-   I'm not sure if this is actually faster than a sequence of
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
+/* At this point:
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
+   But there may be false matches in the next most significant byte from
+   a true match due to carries.  This means we need to recalculate the
+   matches using a longer method for big-endian.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	rTMP1, rTMP5, -1
+	andc	rTMP1, rTMP1, rTMP5
+	cntlzw	rCLZB, rTMP1
+	addi	rTMP2, rTMP4, -1
+	andc	rTMP2, rTMP2, rTMP4
+	cmplw	rTMP1, rTMP2
+	bgtlr
+	subfic	rCLZB, rCLZB, 32-7
+#else
+/* I think we could reduce this by two instructions by keeping the "nor"
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
+   one instruction could be pruned from L(foundit).  */
 	and	rFEFE, r7F7F, rWORD
-	or	rMASK, r7F7F, rWORD
+	or	rTMP5, r7F7F, rWORD
 	and	rTMP1, r7F7F, rTMP3
-	or	rIGN, r7F7F, rTMP3
+	or	rTMP4, r7F7F, rTMP3
 	add	rFEFE, rFEFE, r7F7F
 	add	rTMP1, rTMP1, r7F7F
-	nor	rWORD, rMASK, rFEFE
-	nor	rTMP2, rIGN, rTMP1
+	nor	rWORD, rTMP5, rFEFE
+	nor	rTMP2, rTMP4, rTMP1
+	cntlzw	rCLZB, rTMP2
 	cmplw	rWORD, rTMP2
 	bgtlr
-	cntlzw	rCLZB, rTMP2
+#endif
 	srwi	rCLZB, rCLZB, 3
 	add	rRTN, rSTR, rCLZB
 	blr
 
 L(foundit):
+#ifdef __LITTLE_ENDIAN__
+	addi	rTMP1, rTMP5, -1
+	andc	rTMP1, rTMP1, rTMP5
+	cntlzw	rCLZB, rTMP1
+	subfic	rCLZB, rCLZB, 32-7-32
+	srawi	rCLZB, rCLZB, 3
+#else
 	and	rTMP1, r7F7F, rTMP3
-	or	rIGN, r7F7F, rTMP3
+	or	rTMP4, r7F7F, rTMP3
 	add	rTMP1, rTMP1, r7F7F
-	nor	rTMP2, rIGN, rTMP1
+	nor	rTMP2, rTMP4, rTMP1
 	cntlzw	rCLZB, rTMP2
 	subi	rSTR, rSTR, 4
 	srwi	rCLZB, rCLZB, 3
+#endif
 	add	rRTN, rSTR, rCLZB
 	blr
 END (strchr)
author	Alan Modra <amodra@gmail.com>	2013-08-17 18:46:05 +0930
committer	Alan Modra <amodra@gmail.com>	2013-10-04 10:40:22 +0930
commit	664318c3eb07032e2bfcf47cb2aa3c89280c19e7 (patch)
tree	338e8a4e5b1215319560caa795ce5830f2f46685 /sysdeps/powerpc/powerpc32/strchr.S
parent	43b84013714c46e6dcae4a5564c5527777ad5e08 (diff)
download	glibc-664318c3eb07032e2bfcf47cb2aa3c89280c19e7.tar.gz glibc-664318c3eb07032e2bfcf47cb2aa3c89280c19e7.tar.xz glibc-664318c3eb07032e2bfcf47cb2aa3c89280c19e7.zip