diff options
author | Luis Machado <luisgpm@br.ibm.com> | 2010-06-14 17:13:24 -0700 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2010-06-14 17:13:24 -0700 |
commit | 158db1226717c77f30ee2245e164f2e1f7721637 (patch) | |
tree | 6d7493e39ee07ca0384041b6455ed036d9f469ae /sysdeps/powerpc/powerpc64/power7 | |
parent | 4bc93b30337482a0ad1758b4ba069704d09c39da (diff) | |
download | glibc-158db1226717c77f30ee2245e164f2e1f7721637.tar.gz glibc-158db1226717c77f30ee2245e164f2e1f7721637.tar.xz glibc-158db1226717c77f30ee2245e164f2e1f7721637.zip |
power7 string compare optimizations
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7')
-rw-r--r-- | sysdeps/powerpc/powerpc64/power7/memcmp.S | 984 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power7/strncmp.S | 181 |
2 files changed, 1165 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S new file mode 100644 index 0000000000..f9b5c12cd6 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S @@ -0,0 +1,984 @@ +/* Optimized memcmp implementation for POWER7/PowerPC64. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA + 02110-1301 USA. */ + +#include <sysdep.h> +#include <bp-sym.h> +#include <bp-asm.h> + +/* int [r3] memcmp (const char *s1 [r3], + const char *s2 [r4], + size_t size [r5]) + + Returns 0 in r3 when no difference is found in the first size bytes, + otherwise a positive or negative value chosen by the unsigned + comparison of the first differing pair (doubleword or byte). */ + + .machine power7 +EALIGN (BP_SYM(memcmp),4,0) + CALL_MCOUNT 3 + +#define rTMP r0 +#define rRTN r3 +#define rSTR1 r3 /* first string arg */ +#define rSTR2 r4 /* second string arg */ +#define rN r5 /* max string length */ +/* Note: The Bounded pointer support in this code is broken. This code + was inherited from PPC32 and that support was never completed. + Current PPC gcc does not support -fbounds-check or -fbounded-pointers. 
*/ +#define rWORD1 r6 /* current word in s1 */ +#define rWORD2 r7 /* current word in s2 */ +#define rWORD3 r8 /* next word in s1 */ +#define rWORD4 r9 /* next word in s2 */ +#define rWORD5 r10 /* next word in s1 */ +#define rWORD6 r11 /* next word in s2 */ +#define rBITDIF r12 /* bits that differ in s1 & s2 words */ +#define rWORD7 r30 /* next word in s1 */ +#define rWORD8 r31 /* next word in s2 */ + + xor rTMP,rSTR2,rSTR1 + cmpldi cr6,rN,0 + cmpldi cr1,rN,12 + clrldi. rTMP,rTMP,61 + clrldi rBITDIF,rSTR1,61 + cmpldi cr5,rBITDIF,0 + beq- cr6,L(zeroLength) + dcbt 0,rSTR1 + dcbt 0,rSTR2 +/* If the length is less than 12 bytes, use the byte loop. Otherwise + save r30/r31 and, when the low 3 address bits of the two strings + differ, branch to L(unaligned). */ + blt cr1,L(bytealigned) + std rWORD8,-8(r1) + cfi_offset(rWORD8,-8) + std rWORD7,-16(r1) + cfi_offset(rWORD7,-16) + bne L(unaligned) +/* At this point we know both strings have the same alignment and the + compare length is at least 8 bytes. rBITDIF contains the low order + 3 bits of rSTR1 and cr5 contains the result of the logical compare + of rBITDIF to 0. If rBITDIF == 0 then we are already double word + aligned and can perform the DWaligned loop. + + Otherwise we know the two strings have the same alignment (but not + yet DW). So we can force the string addresses to the next lower DW + boundary and special case this first DW word using shift left to + eliminate bits preceding the first byte. Since we want to join the + normal (DWaligned) compare loop, starting at the second double word, + we need to adjust the length (rN) and special case the loop + versioning for the first DW. This ensures that the loop count is + correct and the first DW (shifted) is in the expected register pair. */ + .align 4 +L(samealignment): + clrrdi rSTR1,rSTR1,3 + clrrdi rSTR2,rSTR2,3 + beq cr5,L(DWaligned) + add rN,rN,rBITDIF + sldi r11,rBITDIF,3 + srdi rTMP,rN,5 /* Divide by 32 */ + andi. 
rBITDIF,rN,24 /* Get the DW remainder */ + ld rWORD1,0(rSTR1) + ld rWORD2,0(rSTR2) + cmpldi cr1,rBITDIF,16 + cmpldi cr7,rN,32 + clrldi rN,rN,61 + beq L(dPs4) + mtctr rTMP + bgt cr1,L(dPs3) + beq cr1,L(dPs2) + +/* Remainder is 8 */ + .align 3 +L(dsP1): + sld rWORD5,rWORD1,r11 + sld rWORD6,rWORD2,r11 + cmpld cr5,rWORD5,rWORD6 + blt cr7,L(dP1x) +/* Do something useful in this cycle since we have to branch anyway. */ + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + cmpld cr0,rWORD1,rWORD2 + b L(dP1e) +/* Remainder is 16 */ + .align 4 +L(dPs2): + sld rWORD5,rWORD1,r11 + sld rWORD6,rWORD2,r11 + cmpld cr6,rWORD5,rWORD6 + blt cr7,L(dP2x) +/* Do something useful in this cycle since we have to branch anyway. */ + ld rWORD7,8(rSTR1) + ld rWORD8,8(rSTR2) + cmpld cr5,rWORD7,rWORD8 + b L(dP2e) +/* Remainder is 24 */ + .align 4 +L(dPs3): + sld rWORD3,rWORD1,r11 + sld rWORD4,rWORD2,r11 + cmpld cr1,rWORD3,rWORD4 + b L(dP3e) +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(dPs4): + mtctr rTMP + sld rWORD1,rWORD1,r11 + sld rWORD2,rWORD2,r11 + cmpld cr0,rWORD1,rWORD2 + b L(dP4e) + +/* At this point we know both strings are double word aligned and the + compare length is at least 8 bytes. */ + .align 4 +L(DWaligned): + andi. rBITDIF,rN,24 /* Get the DW remainder */ + srdi rTMP,rN,5 /* Divide by 32 */ + cmpldi cr1,rBITDIF,16 + cmpldi cr7,rN,32 + clrldi rN,rN,61 + beq L(dP4) + bgt cr1,L(dP3) + beq cr1,L(dP2) + +/* Remainder is 8 */ + .align 4 +L(dP1): + mtctr rTMP +/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early + (8-15 byte compare), we want to use only volatile registers. This + means we can avoid restoring non-volatile registers since we did not + change any on the early exit path. The key here is the non-early + exit path only cares about the condition code (cr5), not about which + register pair was used. 
*/ + ld rWORD5,0(rSTR1) + ld rWORD6,0(rSTR2) + cmpld cr5,rWORD5,rWORD6 + blt cr7,L(dP1x) + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + cmpld cr0,rWORD1,rWORD2 +L(dP1e): + ld rWORD3,16(rSTR1) + ld rWORD4,16(rSTR2) + cmpld cr1,rWORD3,rWORD4 + ld rWORD5,24(rSTR1) + ld rWORD6,24(rSTR2) + cmpld cr6,rWORD5,rWORD6 + bne cr5,L(dLcr5) + bne cr0,L(dLcr0) + + ldu rWORD7,32(rSTR1) + ldu rWORD8,32(rSTR2) + bne cr1,L(dLcr1) + cmpld cr5,rWORD7,rWORD8 + bdnz L(dLoop) + bne cr6,L(dLcr6) + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + .align 3 +L(dP1x): + sldi. r12,rN,3 + bne cr5,L(dLcr5) + subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */ + bne L(d00) + li rRTN,0 + blr + +/* Remainder is 16 */ + .align 4 +L(dP2): + mtctr rTMP + ld rWORD5,0(rSTR1) + ld rWORD6,0(rSTR2) + cmpld cr6,rWORD5,rWORD6 + blt cr7,L(dP2x) + ld rWORD7,8(rSTR1) + ld rWORD8,8(rSTR2) + cmpld cr5,rWORD7,rWORD8 +L(dP2e): + ld rWORD1,16(rSTR1) + ld rWORD2,16(rSTR2) + cmpld cr0,rWORD1,rWORD2 + ld rWORD3,24(rSTR1) + ld rWORD4,24(rSTR2) + cmpld cr1,rWORD3,rWORD4 + addi rSTR1,rSTR1,8 + addi rSTR2,rSTR2,8 + bne cr6,L(dLcr6) + bne cr5,L(dLcr5) + b L(dLoop2) +/* Again we are on an early exit path (16-23 byte compare), we want to + only use volatile registers and avoid restoring non-volatile + registers. */ + .align 4 +L(dP2x): + ld rWORD3,8(rSTR1) + ld rWORD4,8(rSTR2) + cmpld cr5,rWORD3,rWORD4 + sldi. r12,rN,3 + bne cr6,L(dLcr6) + addi rSTR1,rSTR1,8 + addi rSTR2,rSTR2,8 + bne cr5,L(dLcr5) + subfic rN,r12,64 /* Shift count is 64 - (rN * 8). 
*/ + bne L(d00) + li rRTN,0 + blr + +/* Remainder is 24 */ + .align 4 +L(dP3): + mtctr rTMP + ld rWORD3,0(rSTR1) + ld rWORD4,0(rSTR2) + cmpld cr1,rWORD3,rWORD4 +L(dP3e): + ld rWORD5,8(rSTR1) + ld rWORD6,8(rSTR2) + cmpld cr6,rWORD5,rWORD6 + blt cr7,L(dP3x) + ld rWORD7,16(rSTR1) + ld rWORD8,16(rSTR2) + cmpld cr5,rWORD7,rWORD8 + ld rWORD1,24(rSTR1) + ld rWORD2,24(rSTR2) + cmpld cr0,rWORD1,rWORD2 + addi rSTR1,rSTR1,16 + addi rSTR2,rSTR2,16 + bne cr1,L(dLcr1) + bne cr6,L(dLcr6) + b L(dLoop1) +/* Again we are on an early exit path (24-31 byte compare), we want to + only use volatile registers and avoid restoring non-volatile + registers. */ + .align 4 +L(dP3x): + ld rWORD1,16(rSTR1) + ld rWORD2,16(rSTR2) + cmpld cr5,rWORD1,rWORD2 + sldi. r12,rN,3 + bne cr1,L(dLcr1) + addi rSTR1,rSTR1,16 + addi rSTR2,rSTR2,16 + bne cr6,L(dLcr6) + subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */ + bne cr5,L(dLcr5) + bne L(d00) + li rRTN,0 + blr + +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(dP4): + mtctr rTMP + ld rWORD1,0(rSTR1) + ld rWORD2,0(rSTR2) + cmpld cr0,rWORD1,rWORD2 +L(dP4e): + ld rWORD3,8(rSTR1) + ld rWORD4,8(rSTR2) + cmpld cr1,rWORD3,rWORD4 + ld rWORD5,16(rSTR1) + ld rWORD6,16(rSTR2) + cmpld cr6,rWORD5,rWORD6 + ldu rWORD7,24(rSTR1) + ldu rWORD8,24(rSTR2) + cmpld cr5,rWORD7,rWORD8 + bne cr0,L(dLcr0) + bne cr1,L(dLcr1) + bdz- L(d24) /* Adjust CTR as we start with +4 */ +/* This is the primary loop */ + .align 4 +L(dLoop): + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + cmpld cr1,rWORD3,rWORD4 + bne cr6,L(dLcr6) +L(dLoop1): + ld rWORD3,16(rSTR1) + ld rWORD4,16(rSTR2) + cmpld cr6,rWORD5,rWORD6 + bne cr5,L(dLcr5) +L(dLoop2): + ld rWORD5,24(rSTR1) + ld rWORD6,24(rSTR2) + cmpld cr5,rWORD7,rWORD8 + bne cr0,L(dLcr0) +L(dLoop3): + ldu rWORD7,32(rSTR1) + ldu rWORD8,32(rSTR2) + bne cr1,L(dLcr1) + cmpld cr0,rWORD1,rWORD2 + bdnz L(dLoop) + +L(dL4): + cmpld cr1,rWORD3,rWORD4 + bne cr6,L(dLcr6) + cmpld cr6,rWORD5,rWORD6 + bne cr5,L(dLcr5) + cmpld cr5,rWORD7,rWORD8 +L(d44): + 
bne cr0,L(dLcr0) +L(d34): + bne cr1,L(dLcr1) +L(d24): + bne cr6,L(dLcr6) +L(d14): + sldi. r12,rN,3 + bne cr5,L(dLcr5) +L(d04): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */ + beq L(zeroLength) +/* At this point we have a remainder of 1 to 7 bytes to compare. Since + we are aligned it is safe to load the whole double word, and use + shift right double to eliminate bits beyond the compare length. */ +L(d00): + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + srd rWORD1,rWORD1,rN + srd rWORD2,rWORD2,rN + cmpld cr5,rWORD1,rWORD2 + bne cr5,L(dLcr5x) + li rRTN,0 + blr + .align 4 +L(dLcr0): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgtlr cr0 + li rRTN,-1 + blr + .align 4 +L(dLcr1): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgtlr cr1 + li rRTN,-1 + blr + .align 4 +L(dLcr6): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgtlr cr6 + li rRTN,-1 + blr + .align 4 +L(dLcr5): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) +L(dLcr5x): + li rRTN,1 + bgtlr cr5 + li rRTN,-1 + blr + + .align 4 +L(bytealigned): + mtctr rN + beq cr6,L(zeroLength) + +/* We need to prime this loop. This loop is swing modulo scheduled + to avoid pipe delays. The dependent instruction latencies (load to + compare to conditional branch) is 2 to 3 cycles. In this loop each + dispatch group ends in a branch and takes 1 cycle. Effectively + the first iteration of the loop only serves to load operands and + branches based on compares are delayed until the next loop. + + So we must precondition some registers and condition codes so that + we don't exit the loop early on the first iteration. 
*/ + + lbz rWORD1,0(rSTR1) + lbz rWORD2,0(rSTR2) + bdz L(b11) + cmpld cr0,rWORD1,rWORD2 + lbz rWORD3,1(rSTR1) + lbz rWORD4,1(rSTR2) + bdz L(b12) + cmpld cr1,rWORD3,rWORD4 + lbzu rWORD5,2(rSTR1) + lbzu rWORD6,2(rSTR2) + bdz L(b13) + .align 4 +L(bLoop): + lbzu rWORD1,1(rSTR1) + lbzu rWORD2,1(rSTR2) + bne cr0,L(bLcr0) + + cmpld cr6,rWORD5,rWORD6 + bdz L(b3i) + + lbzu rWORD3,1(rSTR1) + lbzu rWORD4,1(rSTR2) + bne cr1,L(bLcr1) + + cmpld cr0,rWORD1,rWORD2 + bdz L(b2i) + + lbzu rWORD5,1(rSTR1) + lbzu rWORD6,1(rSTR2) + bne cr6,L(bLcr6) + + cmpld cr1,rWORD3,rWORD4 + bdnz L(bLoop) + +/* We speculatively load bytes before we have tested the previous + bytes. But we must avoid overrunning the length (in the ctr) to + prevent these speculative loads from causing a segfault. In this + case the loop will exit early (before all the pending bytes are + tested). In this case we must complete the pending operations + before returning. */ +L(b1i): + bne cr0,L(bLcr0) + bne cr1,L(bLcr1) + b L(bx56) + .align 4 +L(b2i): + bne cr6,L(bLcr6) + bne cr0,L(bLcr0) + b L(bx34) + .align 4 +L(b3i): + bne cr1,L(bLcr1) + bne cr6,L(bLcr6) + b L(bx12) + .align 4 +L(bLcr0): + li rRTN,1 + bgtlr cr0 + li rRTN,-1 + blr +L(bLcr1): + li rRTN,1 + bgtlr cr1 + li rRTN,-1 + blr +L(bLcr6): + li rRTN,1 + bgtlr cr6 + li rRTN,-1 + blr + +L(b13): + bne cr0,L(bx12) + bne cr1,L(bx34) +L(bx56): + sub rRTN,rWORD5,rWORD6 + blr + nop +L(b12): + bne cr0,L(bx12) +L(bx34): + sub rRTN,rWORD3,rWORD4 + blr +L(b11): +L(bx12): + sub rRTN,rWORD1,rWORD2 + blr + .align 4 +L(zeroLengthReturn): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) +L(zeroLength): + li rRTN,0 + blr + + .align 4 +/* At this point we know the strings have different alignment and the + compare length is at least 8 bytes. rBITDIF contains the low order + 3 bits of rSTR1 and cr5 contains the result of the logical compare + of rBITDIF to 0. If rBITDIF == 0 then rStr1 is double word + aligned and can perform the DWunaligned loop. 
+ + Otherwise we know that rSTR1 is not DW aligned yet. + So we can force the string addresses to the next lower DW + boundary and special case this first DW word using shift left to + eliminate bits preceding the first byte. Since we want to join the + normal (DWaligned) compare loop, starting at the second double word, + we need to adjust the length (rN) and special case the loop + versioning for the first DW. This ensures that the loop count is + correct and the first DW (shifted) is in the expected register pair. */ +#define rSHL r29 /* Unaligned shift left count. */ +#define rSHR r28 /* Unaligned shift right count. */ +#define rB r27 /* Left rotation temp for rWORD2. */ +#define rD r26 /* Left rotation temp for rWORD4. */ +#define rF r25 /* Left rotation temp for rWORD6. */ +#define rH r24 /* Left rotation temp for rWORD8. */ +#define rA r0 /* Right rotation temp for rWORD2. */ +#define rC r12 /* Right rotation temp for rWORD4. */ +#define rE r0 /* Right rotation temp for rWORD6. */ +#define rG r12 /* Right rotation temp for rWORD8. */ +L(unaligned): + std r29,-24(r1) + cfi_offset(r29,-24) + clrldi rSHL,rSTR2,61 + beq cr6,L(duzeroLength) + std r28,-32(r1) + cfi_offset(r28,-32) + beq cr5,L(DWunaligned) + std r27,-40(r1) + cfi_offset(r27,-40) +/* Adjust the logical start of rSTR2 to compensate for the extra bits + in the 1st rSTR1 DW. */ + sub r27,rSTR2,rBITDIF +/* But do not attempt to address the DW before the DW that contains + the actual start of rSTR2. */ + clrrdi rSTR2,rSTR2,3 + std r26,-48(r1) + cfi_offset(r26,-48) +/* Compute the left/right shift counts for the unaligned rSTR2, + compensating for the logical (DW aligned) start of rSTR1. */ + clrldi rSHL,r27,61 + clrrdi rSTR1,rSTR1,3 + std r25,-56(r1) + cfi_offset(r25,-56) + sldi rSHL,rSHL,3 + cmpld cr5,r27,rSTR2 + add rN,rN,rBITDIF + sldi r11,rBITDIF,3 + std r24,-64(r1) + cfi_offset(r24,-64) + subfic rSHR,rSHL,64 + srdi rTMP,rN,5 /* Divide by 32 */ + andi. 
rBITDIF,rN,24 /* Get the DW remainder */ +/* We normally need to load 2 DWs to start the unaligned rSTR2, but in + this special case those bits may be discarded anyway. Also we + must avoid loading a DW where none of the bits are part of rSTR2 as + this may cross a page boundary and cause a page fault. */ + li rWORD8,0 + blt cr5,L(dus0) + ld rWORD8,0(rSTR2) + la rSTR2,8(rSTR2) + sld rWORD8,rWORD8,rSHL + +L(dus0): + ld rWORD1,0(rSTR1) + ld rWORD2,0(rSTR2) + cmpldi cr1,rBITDIF,16 + cmpldi cr7,rN,32 + srd rG,rWORD2,rSHR + clrldi rN,rN,61 + beq L(duPs4) + mtctr rTMP + or rWORD8,rG,rWORD8 + bgt cr1,L(duPs3) + beq cr1,L(duPs2) + +/* Remainder is 8 */ + .align 4 +L(dusP1): + sld rB,rWORD2,rSHL + sld rWORD7,rWORD1,r11 + sld rWORD8,rWORD8,r11 + bge cr7,L(duP1e) +/* At this point we exit early with the first double word compare + complete and remainder of 0 to 7 bytes. See L(du14) for details on + how we handle the remaining bytes. */ + cmpld cr5,rWORD7,rWORD8 + sldi. rN,rN,3 + bne cr5,L(duLcr5) + cmpld cr7,rN,rSHR + beq L(duZeroReturn) + li rA,0 + ble cr7,L(dutrim) + ld rWORD2,8(rSTR2) + srd rA,rWORD2,rSHR + b L(dutrim) +/* Remainder is 16 */ + .align 4 +L(duPs2): + sld rH,rWORD2,rSHL + sld rWORD5,rWORD1,r11 + sld rWORD6,rWORD8,r11 + b L(duP2e) +/* Remainder is 24 */ + .align 4 +L(duPs3): + sld rF,rWORD2,rSHL + sld rWORD3,rWORD1,r11 + sld rWORD4,rWORD8,r11 + b L(duP3e) +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(duPs4): + mtctr rTMP + or rWORD8,rG,rWORD8 + sld rD,rWORD2,rSHL + sld rWORD1,rWORD1,r11 + sld rWORD2,rWORD8,r11 + b L(duP4e) + +/* At this point we know rSTR1 is double word aligned and the + compare length is at least 8 bytes. */ + .align 4 +L(DWunaligned): + std r27,-40(r1) + cfi_offset(r27,-40) + clrrdi rSTR2,rSTR2,3 + std r26,-48(r1) + cfi_offset(r26,-48) + srdi rTMP,rN,5 /* Divide by 32 */ + std r25,-56(r1) + cfi_offset(r25,-56) + andi. 
rBITDIF,rN,24 /* Get the DW remainder */ + std r24,-64(r1) + cfi_offset(r24,-64) + sldi rSHL,rSHL,3 + ld rWORD6,0(rSTR2) + ldu rWORD8,8(rSTR2) + cmpldi cr1,rBITDIF,16 + cmpldi cr7,rN,32 + clrldi rN,rN,61 + subfic rSHR,rSHL,64 + sld rH,rWORD6,rSHL + beq L(duP4) + mtctr rTMP + bgt cr1,L(duP3) + beq cr1,L(duP2) + +/* Remainder is 8 */ + .align 4 +L(duP1): + srd rG,rWORD8,rSHR + ld rWORD7,0(rSTR1) + sld rB,rWORD8,rSHL + or rWORD8,rG,rH + blt cr7,L(duP1x) +L(duP1e): + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + cmpld cr5,rWORD7,rWORD8 + srd rA,rWORD2,rSHR + sld rD,rWORD2,rSHL + or rWORD2,rA,rB + ld rWORD3,16(rSTR1) + ld rWORD4,16(rSTR2) + cmpld cr0,rWORD1,rWORD2 + srd rC,rWORD4,rSHR + sld rF,rWORD4,rSHL + bne cr5,L(duLcr5) + or rWORD4,rC,rD + ld rWORD5,24(rSTR1) + ld rWORD6,24(rSTR2) + cmpld cr1,rWORD3,rWORD4 + srd rE,rWORD6,rSHR + sld rH,rWORD6,rSHL + bne cr0,L(duLcr0) + or rWORD6,rE,rF + cmpld cr6,rWORD5,rWORD6 + b L(duLoop3) + .align 4 +/* At this point we exit early with the first double word compare + complete and remainder of 0 to 7 bytes. See L(du14) for details on + how we handle the remaining bytes. */ +L(duP1x): + cmpld cr5,rWORD7,rWORD8 + sldi. 
rN,rN,3 + bne cr5,L(duLcr5) + cmpld cr7,rN,rSHR + beq L(duZeroReturn) + li rA,0 + ble cr7,L(dutrim) + ld rWORD2,8(rSTR2) + srd rA,rWORD2,rSHR + b L(dutrim) +/* Remainder is 16 */ + .align 4 +L(duP2): + srd rE,rWORD8,rSHR + ld rWORD5,0(rSTR1) + or rWORD6,rE,rH + sld rH,rWORD8,rSHL +L(duP2e): + ld rWORD7,8(rSTR1) + ld rWORD8,8(rSTR2) + cmpld cr6,rWORD5,rWORD6 + srd rG,rWORD8,rSHR + sld rB,rWORD8,rSHL + or rWORD8,rG,rH + blt cr7,L(duP2x) + ld rWORD1,16(rSTR1) + ld rWORD2,16(rSTR2) + cmpld cr5,rWORD7,rWORD8 + bne cr6,L(duLcr6) + srd rA,rWORD2,rSHR + sld rD,rWORD2,rSHL + or rWORD2,rA,rB + ld rWORD3,24(rSTR1) + ld rWORD4,24(rSTR2) + cmpld cr0,rWORD1,rWORD2 + bne cr5,L(duLcr5) + srd rC,rWORD4,rSHR + sld rF,rWORD4,rSHL + or rWORD4,rC,rD + addi rSTR1,rSTR1,8 + addi rSTR2,rSTR2,8 + cmpld cr1,rWORD3,rWORD4 + b L(duLoop2) + .align 4 +L(duP2x): + cmpld cr5,rWORD7,rWORD8 + addi rSTR1,rSTR1,8 + addi rSTR2,rSTR2,8 + bne cr6,L(duLcr6) + sldi. rN,rN,3 + bne cr5,L(duLcr5) + cmpld cr7,rN,rSHR + beq L(duZeroReturn) + li rA,0 + ble cr7,L(dutrim) + ld rWORD2,8(rSTR2) + srd rA,rWORD2,rSHR + b L(dutrim) + +/* Remainder is 24 */ + .align 4 +L(duP3): + srd rC,rWORD8,rSHR + ld rWORD3,0(rSTR1) + sld rF,rWORD8,rSHL + or rWORD4,rC,rH +L(duP3e): + ld rWORD5,8(rSTR1) + ld rWORD6,8(rSTR2) + cmpld cr1,rWORD3,rWORD4 + srd rE,rWORD6,rSHR + sld rH,rWORD6,rSHL + or rWORD6,rE,rF + ld rWORD7,16(rSTR1) + ld rWORD8,16(rSTR2) + cmpld cr6,rWORD5,rWORD6 + bne cr1,L(duLcr1) + srd rG,rWORD8,rSHR + sld rB,rWORD8,rSHL + or rWORD8,rG,rH + blt cr7,L(duP3x) + ld rWORD1,24(rSTR1) + ld rWORD2,24(rSTR2) + cmpld cr5,rWORD7,rWORD8 + bne cr6,L(duLcr6) + srd rA,rWORD2,rSHR + sld rD,rWORD2,rSHL + or rWORD2,rA,rB + addi rSTR1,rSTR1,16 + addi rSTR2,rSTR2,16 + cmpld cr0,rWORD1,rWORD2 + b L(duLoop1) + .align 4 +L(duP3x): + addi rSTR1,rSTR1,16 + addi rSTR2,rSTR2,16 + bne cr1,L(duLcr1) + cmpld cr5,rWORD7,rWORD8 + bne cr6,L(duLcr6) + sldi. 
rN,rN,3 + bne cr5,L(duLcr5) + cmpld cr7,rN,rSHR + beq L(duZeroReturn) + li rA,0 + ble cr7,L(dutrim) + ld rWORD2,8(rSTR2) + srd rA,rWORD2,rSHR + b L(dutrim) + +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(duP4): + mtctr rTMP + srd rA,rWORD8,rSHR + ld rWORD1,0(rSTR1) + sld rD,rWORD8,rSHL + or rWORD2,rA,rH +L(duP4e): + ld rWORD3,8(rSTR1) + ld rWORD4,8(rSTR2) + cmpld cr0,rWORD1,rWORD2 + srd rC,rWORD4,rSHR + sld rF,rWORD4,rSHL + or rWORD4,rC,rD + ld rWORD5,16(rSTR1) + ld rWORD6,16(rSTR2) + cmpld cr1,rWORD3,rWORD4 + bne cr0,L(duLcr0) + srd rE,rWORD6,rSHR + sld rH,rWORD6,rSHL + or rWORD6,rE,rF + ldu rWORD7,24(rSTR1) + ldu rWORD8,24(rSTR2) + cmpld cr6,rWORD5,rWORD6 + bne cr1,L(duLcr1) + srd rG,rWORD8,rSHR + sld rB,rWORD8,rSHL + or rWORD8,rG,rH + cmpld cr5,rWORD7,rWORD8 + bdz L(du24) /* Adjust CTR as we start with +4 */ +/* This is the primary loop */ + .align 4 +L(duLoop): + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + cmpld cr1,rWORD3,rWORD4 + bne cr6,L(duLcr6) + srd rA,rWORD2,rSHR + sld rD,rWORD2,rSHL + or rWORD2,rA,rB +L(duLoop1): + ld rWORD3,16(rSTR1) + ld rWORD4,16(rSTR2) + cmpld cr6,rWORD5,rWORD6 + bne cr5,L(duLcr5) + srd rC,rWORD4,rSHR + sld rF,rWORD4,rSHL + or rWORD4,rC,rD +L(duLoop2): + ld rWORD5,24(rSTR1) + ld rWORD6,24(rSTR2) + cmpld cr5,rWORD7,rWORD8 + bne cr0,L(duLcr0) + srd rE,rWORD6,rSHR + sld rH,rWORD6,rSHL + or rWORD6,rE,rF +L(duLoop3): + ldu rWORD7,32(rSTR1) + ldu rWORD8,32(rSTR2) + cmpld cr0,rWORD1,rWORD2 + bne- cr1,L(duLcr1) + srd rG,rWORD8,rSHR + sld rB,rWORD8,rSHL + or rWORD8,rG,rH + bdnz L(duLoop) + +L(duL4): + bne cr1,L(duLcr1) + cmpld cr1,rWORD3,rWORD4 + bne cr6,L(duLcr6) + cmpld cr6,rWORD5,rWORD6 + bne cr5,L(duLcr5) + cmpld cr5,rWORD7,rWORD8 +L(du44): + bne cr0,L(duLcr0) +L(du34): + bne cr1,L(duLcr1) +L(du24): + bne cr6,L(duLcr6) +L(du14): + sldi. rN,rN,3 + bne cr5,L(duLcr5) +/* At this point we have a remainder of 1 to 7 bytes to compare. We use + shift right double to eliminate bits beyond the compare length. 
+ This allows the use of double word subtract to compute the final + result. + + However it may not be safe to load rWORD2 which may be beyond the + string length. So we compare the bit length of the remainder to + the right shift count (rSHR). If the bit count is less than or equal + we do not need to load rWORD2 (all significant bits are already in + rB). */ + cmpld cr7,rN,rSHR + beq L(duZeroReturn) + li rA,0 + ble cr7,L(dutrim) + ld rWORD2,8(rSTR2) + srd rA,rWORD2,rSHR + .align 4 +L(dutrim): + ld rWORD1,8(rSTR1) + ld rWORD8,-8(r1) + subfic rN,rN,64 /* Shift count is 64 - (rN * 8). */ + or rWORD2,rA,rB + ld rWORD7,-16(r1) + ld r29,-24(r1) + srd rWORD1,rWORD1,rN + srd rWORD2,rWORD2,rN + ld r28,-32(r1) + ld r27,-40(r1) + li rRTN,0 + cmpld cr0,rWORD1,rWORD2 + ld r26,-48(r1) + ld r25,-56(r1) + beq cr0,L(dureturn24) + li rRTN,1 + ld r24,-64(r1) + bgtlr cr0 + li rRTN,-1 + blr + .align 4 +L(duLcr0): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgt cr0,L(dureturn29) + ld r29,-24(r1) + ld r28,-32(r1) + li rRTN,-1 + b L(dureturn27) + .align 4 +L(duLcr1): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgt cr1,L(dureturn29) + ld r29,-24(r1) + ld r28,-32(r1) + li rRTN,-1 + b L(dureturn27) + .align 4 +L(duLcr6): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgt cr6,L(dureturn29) + ld r29,-24(r1) + ld r28,-32(r1) + li rRTN,-1 + b L(dureturn27) + .align 4 +L(duLcr5): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgt cr5,L(dureturn29) + ld r29,-24(r1) + ld r28,-32(r1) + li rRTN,-1 + b L(dureturn27) + .align 3 +L(duZeroReturn): + li rRTN,0 + .align 4 +L(dureturn): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) +L(dureturn29): + ld r29,-24(r1) + ld r28,-32(r1) +L(dureturn27): + ld r27,-40(r1) +L(dureturn26): + ld r26,-48(r1) +L(dureturn25): + ld r25,-56(r1) +L(dureturn24): + ld r24,-64(r1) + blr +L(duzeroLength): + li rRTN,0 + blr + +END (BP_SYM (memcmp)) +libc_hidden_builtin_def (memcmp) +weak_alias (memcmp,bcmp) diff --git 
a/sysdeps/powerpc/powerpc64/power7/strncmp.S b/sysdeps/powerpc/powerpc64/power7/strncmp.S new file mode 100644 index 0000000000..e32920e219 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/strncmp.S @@ -0,0 +1,181 @@ +/* Optimized strncmp implementation for POWER7/PowerPC64. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA + 02110-1301 USA. */ + +#include <sysdep.h> +#include <bp-sym.h> +#include <bp-asm.h> + +/* See strlen.s for comments on how the end-of-string testing works. */ + +/* int [r3] strncmp (const char *s1 [r3], + const char *s2 [r4], + size_t size [r5]) */ + +EALIGN (BP_SYM(strncmp),4,0) + CALL_MCOUNT 3 + +#define rTMP r0 +#define rRTN r3 +#define rSTR1 r3 /* first string arg */ +#define rSTR2 r4 /* second string arg */ +#define rN r5 /* max string length */ +/* Note: The Bounded pointer support in this code is broken. This code + was inherited from PPC32 and that support was never completed. + Current PPC gcc does not support -fbounds-check or -fbounded-pointers. 
*/ +#define rWORD1 r6 /* current word in s1 */ +#define rWORD2 r7 /* current word in s2 */ +#define rWORD3 r10 +#define rWORD4 r11 +#define rFEFE r8 /* constant 0xfefefefefefefeff (-0x0101010101010101) */ +#define r7F7F r9 /* constant 0x7f7f7f7f7f7f7f7f */ +#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */ +#define rBITDIF r11 /* bits that differ in s1 & s2 words */ + + dcbt 0,rSTR1 + or rTMP,rSTR2,rSTR1 + lis r7F7F,0x7f7f + dcbt 0,rSTR2 + clrldi. rTMP,rTMP,61 + cmpldi cr1,rN,0 + lis rFEFE,-0x101 + bne L(unaligned) +/* We are doubleword aligned so set up for two loops: first a double word + loop, then fall into the byte loop if any residual. */ + srdi. rTMP,rN,3 + clrldi rN,rN,61 + addi rFEFE,rFEFE,-0x101 + addi r7F7F,r7F7F,0x7f7f + cmpldi cr1,rN,0 + beq L(unaligned) + + mtctr rTMP + ld rWORD1,0(rSTR1) + ld rWORD2,0(rSTR2) + sldi rTMP,rFEFE,32 + insrdi r7F7F,r7F7F,32,0 + add rFEFE,rFEFE,rTMP + b L(g1) + +L(g0): + ldu rWORD1,8(rSTR1) + bne cr1,L(different) + ldu rWORD2,8(rSTR2) +L(g1): add rTMP,rFEFE,rWORD1 + nor rNEG,r7F7F,rWORD1 + bdz L(tail) + and. rTMP,rTMP,rNEG + cmpd cr1,rWORD1,rWORD2 + beq L(g0) + +/* OK. We've hit the end of the string. We need to be careful that + we don't compare two strings as different because of gunk beyond + the end of the strings... */ + +L(endstring): + and rTMP,r7F7F,rWORD1 + beq cr1,L(equal) + add rTMP,rTMP,r7F7F + xor. rBITDIF,rWORD1,rWORD2 + + andc rNEG,rNEG,rTMP + blt L(highbit) + cntlzd rBITDIF,rBITDIF + cntlzd rNEG,rNEG + addi rNEG,rNEG,7 + cmpd cr1,rNEG,rBITDIF + sub rRTN,rWORD1,rWORD2 + blt cr1,L(equal) + sradi rRTN,rRTN,63 + ori rRTN,rRTN,1 + blr +L(equal): + li rRTN,0 + blr + +L(different): + ldu rWORD1,-8(rSTR1) + xor. rBITDIF,rWORD1,rWORD2 + sub rRTN,rWORD1,rWORD2 + blt L(highbit) + sradi rRTN,rRTN,63 + ori rRTN,rRTN,1 + blr +L(highbit): + srdi rWORD2,rWORD2,56 + srdi rWORD1,rWORD1,56 + sub rRTN,rWORD1,rWORD2 + blr + + +/* Oh well. In this case, we just do a byte-by-byte comparison. 
*/ + .align 4 +L(tail): + and. rTMP,rTMP,rNEG + cmpd cr1,rWORD1,rWORD2 + bne L(endstring) + addi rSTR1,rSTR1,8 + bne cr1,L(different) + addi rSTR2,rSTR2,8 + cmpldi cr1,rN,0 +L(unaligned): + mtctr rN + ble cr1,L(ux) +L(uz): + lbz rWORD1,0(rSTR1) + lbz rWORD2,0(rSTR2) + .align 4 +L(u1): + cmpdi cr1,rWORD1,0 + bdz L(u4) + cmpd rWORD1,rWORD2 + beq cr1,L(u4) + lbzu rWORD3,1(rSTR1) + lbzu rWORD4,1(rSTR2) + bne L(u4) + cmpdi cr1,rWORD3,0 + bdz L(u3) + cmpd rWORD3,rWORD4 + beq cr1,L(u3) + lbzu rWORD1,1(rSTR1) + lbzu rWORD2,1(rSTR2) + bne L(u3) + cmpdi cr1,rWORD1,0 + bdz L(u4) + cmpd rWORD1,rWORD2 + beq cr1,L(u4) + lbzu rWORD3,1(rSTR1) + lbzu rWORD4,1(rSTR2) + bne L(u4) + cmpdi cr1,rWORD3,0 + bdz L(u3) + cmpd rWORD3,rWORD4 + beq cr1,L(u3) + lbzu rWORD1,1(rSTR1) + lbzu rWORD2,1(rSTR2) + beq L(u1) + +L(u3): sub rRTN,rWORD3,rWORD4 + blr +L(u4): sub rRTN,rWORD1,rWORD2 + blr +L(ux): + li rRTN,0 + blr +END (BP_SYM (strncmp)) +libc_hidden_builtin_def (strncmp) |