diff options
author | Alan Modra <amodra@gmail.com> | 2013-08-17 18:40:11 +0930 |
---|---|---|
committer | Alan Modra <amodra@gmail.com> | 2013-10-04 10:39:32 +0930 |
commit | db9b4570c5dc550074140ac1d1677077fba29a26 (patch) | |
tree | c6469b8ce6b7ec28cc9f7c27484e67d351ce3349 /sysdeps/powerpc/powerpc32/strlen.S | |
parent | f7c399cff5bd04ee9dc117fb6b0f39597dc047c6 (diff) | |
download | glibc-db9b4570c5dc550074140ac1d1677077fba29a26.tar.gz glibc-db9b4570c5dc550074140ac1d1677077fba29a26.tar.xz glibc-db9b4570c5dc550074140ac1d1677077fba29a26.zip |
PowerPC LE strlen
http://sourceware.org/ml/libc-alpha/2013-08/msg00097.html This is the first of nine patches adding little-endian support to the existing optimised string and memory functions. I did spend some time with a power7 simulator looking at cycle-by-cycle behaviour for memchr, but most of these patches have not been run on CPU simulators to check that we are going as fast as possible. I'm sure PowerPC can do better. However, the little-endian support mostly leaves main loops unchanged, so I'm banking on previous authors having done a good job on big-endian. As with most code you stare at long enough, I found some improvements for big-endian too. Little-endian support for strlen. Like most of the string functions, I leave the main word or multiple-word loops substantially unchanged, just needing to modify the tail. Removing the branch in the power7 functions is just a tidy-up; .align produces a branch anyway. Modifying regs in the non-power7 functions is to suit the new little-endian tail. * sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian support. Don't branch over align. * sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise. * sysdeps/powerpc/powerpc64/strlen.S (strlen): Add little-endian support. Rearrange tmp reg use to suit. Comment. * sysdeps/powerpc/powerpc32/strlen.S: Likewise.
Diffstat (limited to 'sysdeps/powerpc/powerpc32/strlen.S')
-rw-r--r-- | sysdeps/powerpc/powerpc32/strlen.S | 69 |
1 files changed, 52 insertions, 17 deletions
diff --git a/sysdeps/powerpc/powerpc32/strlen.S b/sysdeps/powerpc/powerpc32/strlen.S index 9a6eafc382..a7153ed7a2 100644 --- a/sysdeps/powerpc/powerpc32/strlen.S +++ b/sysdeps/powerpc/powerpc32/strlen.S @@ -29,7 +29,12 @@ 1 is subtracted you get a value in the range 0x00-0x7f, none of which have their high bit set. The expression here is (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when - there were no 0x00 bytes in the word. + there were no 0x00 bytes in the word. You get 0x80 in bytes that + match, but possibly false 0x80 matches in the next more significant + byte to a true match due to carries. For little-endian this is + of no consequence since the least significant match is the one + we're interested in, but big-endian needs method 2 to find which + byte matches. 2) Given a word 'x', we can test to see _which_ byte was zero by calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f). @@ -72,7 +77,7 @@ ENTRY (strlen) -#define rTMP1 r0 +#define rTMP4 r0 #define rRTN r3 /* incoming STR arg, outgoing result */ #define rSTR r4 /* current string position */ #define rPADN r5 /* number of padding bits we prepend to the @@ -82,9 +87,9 @@ ENTRY (strlen) #define rWORD1 r8 /* current string word */ #define rWORD2 r9 /* next string word */ #define rMASK r9 /* mask for first string word */ -#define rTMP2 r10 -#define rTMP3 r11 -#define rTMP4 r12 +#define rTMP1 r10 +#define rTMP2 r11 +#define rTMP3 r12 clrrwi rSTR, rRTN, 2 @@ -93,15 +98,20 @@ ENTRY (strlen) lwz rWORD1, 0(rSTR) li rMASK, -1 addi r7F7F, r7F7F, 0x7f7f -/* That's the setup done, now do the first pair of words. - We make an exception and use method (2) on the first two words, to reduce - overhead. */ +/* We use method (2) on the first two words, because rFEFE isn't + required which reduces setup overhead. Also gives a faster return + for small strings on big-endian due to needing to recalculate with + method (2) anyway. 
*/ +#ifdef __LITTLE_ENDIAN__ + slw rMASK, rMASK, rPADN +#else srw rMASK, rMASK, rPADN +#endif and rTMP1, r7F7F, rWORD1 or rTMP2, r7F7F, rWORD1 add rTMP1, rTMP1, r7F7F - nor rTMP1, rTMP2, rTMP1 - and. rWORD1, rTMP1, rMASK + nor rTMP3, rTMP2, rTMP1 + and. rTMP3, rTMP3, rMASK mtcrf 0x01, rRTN bne L(done0) lis rFEFE, -0x101 @@ -110,11 +120,12 @@ ENTRY (strlen) bt 29, L(loop) /* Handle second word of pair. */ +/* Perhaps use method (1) here for little-endian, saving one instruction? */ lwzu rWORD1, 4(rSTR) and rTMP1, r7F7F, rWORD1 or rTMP2, r7F7F, rWORD1 add rTMP1, rTMP1, r7F7F - nor. rWORD1, rTMP2, rTMP1 + nor. rTMP3, rTMP2, rTMP1 bne L(done0) /* The loop. */ @@ -128,28 +139,52 @@ L(loop): add rTMP3, rFEFE, rWORD2 nor rTMP4, r7F7F, rWORD2 bne L(done1) - and. rTMP1, rTMP3, rTMP4 + and. rTMP3, rTMP3, rTMP4 beq L(loop) +#ifndef __LITTLE_ENDIAN__ and rTMP1, r7F7F, rWORD2 add rTMP1, rTMP1, r7F7F - andc rWORD1, rTMP4, rTMP1 + andc rTMP3, rTMP4, rTMP1 b L(done0) L(done1): and rTMP1, r7F7F, rWORD1 subi rSTR, rSTR, 4 add rTMP1, rTMP1, r7F7F - andc rWORD1, rTMP2, rTMP1 + andc rTMP3, rTMP2, rTMP1 /* When we get to here, rSTR points to the first word in the string that - contains a zero byte, and the most significant set bit in rWORD1 is in that - byte. */ + contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, + and 0x00 otherwise. */ L(done0): - cntlzw rTMP3, rWORD1 + cntlzw rTMP3, rTMP3 subf rTMP1, rRTN, rSTR srwi rTMP3, rTMP3, 3 add rRTN, rTMP1, rTMP3 blr +#else + +L(done0): + addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */ + andc rTMP1, rTMP1, rTMP3 + cntlzw rTMP1, rTMP1 /* Count bits not in the mask. 
*/ + subf rTMP3, rRTN, rSTR + subfic rTMP1, rTMP1, 32-7 + srwi rTMP1, rTMP1, 3 + add rRTN, rTMP1, rTMP3 + blr + +L(done1): + addi rTMP3, rTMP1, -1 + andc rTMP3, rTMP3, rTMP1 + cntlzw rTMP3, rTMP3 + subf rTMP1, rRTN, rSTR + subfic rTMP3, rTMP3, 32-7-32 + srawi rTMP3, rTMP3, 3 + add rRTN, rTMP1, rTMP3 + blr +#endif + END (strlen) libc_hidden_builtin_def (strlen) |